From bb6135880e5e453d7701764b9f2e4ad3356a68d7 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 3 Jul 2019 22:34:46 -0400
Subject: [PATCH] STYLE: Apply black formatting
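
Apply black formatting across the code base (asv benchmarks, doc and
CI tooling, pandas itself, and the test suite). The change is purely
mechanical, as the diff below shows: string literals are normalized to
double quotes, long call signatures and collections are exploded one
element per line with trailing commas, and power expressions are
spaced (10**5 becomes 10 ** 5).

As an illustration only (the exact black release and configuration are
not recorded in this message; black.FileMode() is assumed to be the
default mode of a 2019-era black), the same rewrites can be reproduced
through black's Python API:

    import black  # assumption: a black release providing FileMode/format_str

    src = "d = {'int': np.arange(N).repeat(5), 'float': np.random.randn(N)}"
    # format_str applies the same rules that running `black .` applies
    # file by file: double quotes and exploded, trailing-comma dicts.
    print(black.format_str(src, mode=black.FileMode()))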
---
asv_bench/benchmarks/algorithms.py | 123 +-
asv_bench/benchmarks/attrs_caching.py | 6 +-
asv_bench/benchmarks/binary_ops.py | 51 +-
asv_bench/benchmarks/categoricals.py | 114 +-
asv_bench/benchmarks/ctors.py | 52 +-
asv_bench/benchmarks/dtypes.py | 32 +-
asv_bench/benchmarks/eval.py | 31 +-
asv_bench/benchmarks/frame_ctor.py | 22 +-
asv_bench/benchmarks/frame_methods.py | 223 +-
asv_bench/benchmarks/gil.py | 136 +-
asv_bench/benchmarks/groupby.py | 515 +-
asv_bench/benchmarks/index_object.py | 75 +-
asv_bench/benchmarks/indexing.py | 135 +-
asv_bench/benchmarks/indexing_engines.py | 59 +-
asv_bench/benchmarks/inference.py | 75 +-
asv_bench/benchmarks/io/csv.py | 309 +-
asv_bench/benchmarks/io/excel.py | 18 +-
asv_bench/benchmarks/io/hdf.py | 106 +-
asv_bench/benchmarks/io/json.py | 134 +-
asv_bench/benchmarks/io/msgpack.py | 13 +-
asv_bench/benchmarks/io/parsers.py | 18 +-
asv_bench/benchmarks/io/pickle.py | 13 +-
asv_bench/benchmarks/io/sas.py | 20 +-
asv_bench/benchmarks/io/sql.py | 161 +-
asv_bench/benchmarks/io/stata.py | 38 +-
asv_bench/benchmarks/join_merge.py | 274 +-
asv_bench/benchmarks/multiindex_object.py | 84 +-
asv_bench/benchmarks/offset.py | 73 +-
asv_bench/benchmarks/pandas_vb_common.py | 39 +-
asv_bench/benchmarks/period.py | 92 +-
asv_bench/benchmarks/plotting.py | 47 +-
asv_bench/benchmarks/reindex.py | 70 +-
asv_bench/benchmarks/replace.py | 29 +-
asv_bench/benchmarks/reshape.py | 144 +-
asv_bench/benchmarks/rolling.py | 80 +-
asv_bench/benchmarks/series_methods.py | 141 +-
asv_bench/benchmarks/sparse.py | 49 +-
asv_bench/benchmarks/stat_ops.py | 47 +-
asv_bench/benchmarks/strings.py | 84 +-
asv_bench/benchmarks/timedelta.py | 59 +-
asv_bench/benchmarks/timeseries.py | 206 +-
asv_bench/benchmarks/timestamp.py | 35 +-
ci/print_skipped.py | 33 +-
doc/logo/pandas_logo.py | 16 +-
doc/make.py | 266 +-
doc/source/conf.py | 347 +-
doc/sphinxext/announce.py | 48 +-
doc/sphinxext/contributors.py | 24 +-
pandas/__init__.py | 189 +-
pandas/_config/__init__.py | 21 +-
pandas/_config/config.py | 126 +-
pandas/_config/dates.py | 12 +-
pandas/_config/display.py | 11 +-
pandas/_config/localization.py | 22 +-
pandas/_libs/__init__.py | 9 +-
pandas/_typing.py | 28 +-
pandas/_version.py | 123 +-
pandas/api/extensions/__init__.py | 14 +-
pandas/api/types/__init__.py | 6 +-
pandas/arrays/__init__.py | 27 +-
pandas/compat/__init__.py | 14 +-
pandas/compat/_optional.py | 17 +-
pandas/compat/chainmap.py | 1 -
pandas/compat/numpy/__init__.py | 47 +-
pandas/compat/numpy/function.py | 296 +-
pandas/compat/pickle_compat.py | 136 +-
pandas/conftest.py | 274 +-
pandas/core/accessor.py | 67 +-
pandas/core/algorithms.py | 605 ++-
pandas/core/api.py | 19 +-
pandas/core/apply.py | 162 +-
pandas/core/arrays/__init__.py | 5 +-
pandas/core/arrays/_ranges.py | 78 +-
pandas/core/arrays/array_.py | 31 +-
pandas/core/arrays/base.py | 97 +-
pandas/core/arrays/categorical.py | 636 +--
pandas/core/arrays/datetimelike.py | 354 +-
pandas/core/arrays/datetimes.py | 647 ++-
pandas/core/arrays/integer.py | 274 +-
pandas/core/arrays/interval.py | 348 +-
pandas/core/arrays/numpy_.py | 175 +-
pandas/core/arrays/period.py | 267 +-
pandas/core/arrays/sparse.py | 462 +-
pandas/core/arrays/timedeltas.py | 249 +-
pandas/core/base.py | 368 +-
pandas/core/common.py | 50 +-
pandas/core/computation/align.py | 36 +-
pandas/core/computation/check.py | 5 +-
pandas/core/computation/common.py | 2 +-
pandas/core/computation/engines.py | 23 +-
pandas/core/computation/eval.py | 95 +-
pandas/core/computation/expr.py | 347 +-
pandas/core/computation/expressions.py | 88 +-
pandas/core/computation/ops.py | 194 +-
pandas/core/computation/pytables.py | 205 +-
pandas/core/computation/scope.py | 66 +-
pandas/core/config_init.py | 351 +-
pandas/core/dtypes/api.py | 55 +-
pandas/core/dtypes/base.py | 16 +-
pandas/core/dtypes/cast.py | 342 +-
pandas/core/dtypes/common.py | 228 +-
pandas/core/dtypes/concat.py | 167 +-
pandas/core/dtypes/dtypes.py | 243 +-
pandas/core/dtypes/generic.py | 125 +-
pandas/core/dtypes/inference.py | 47 +-
pandas/core/dtypes/missing.py | 109 +-
pandas/core/frame.py | 1980 +++++---
pandas/core/generic.py | 2806 +++++++----
pandas/core/groupby/__init__.py | 5 +-
pandas/core/groupby/base.py | 67 +-
pandas/core/groupby/categorical.py | 15 +-
pandas/core/groupby/generic.py | 467 +-
pandas/core/groupby/groupby.py | 600 ++-
pandas/core/groupby/grouper.py | 230 +-
pandas/core/groupby/ops.py | 351 +-
pandas/core/index.py | 28 +-
pandas/core/indexes/accessors.py | 112 +-
pandas/core/indexes/api.py | 83 +-
pandas/core/indexes/base.py | 1376 +++---
pandas/core/indexes/category.py | 253 +-
pandas/core/indexes/datetimelike.py | 182 +-
pandas/core/indexes/datetimes.py | 582 ++-
pandas/core/indexes/frozen.py | 27 +-
pandas/core/indexes/interval.py | 524 +-
pandas/core/indexes/multi.py | 944 ++--
pandas/core/indexes/numeric.py | 217 +-
pandas/core/indexes/period.py | 406 +-
pandas/core/indexes/range.py | 206 +-
pandas/core/indexes/timedeltas.py | 262 +-
pandas/core/indexing.py | 531 +-
pandas/core/internals/__init__.py | 26 +-
pandas/core/internals/blocks.py | 1119 +++--
pandas/core/internals/concat.py | 149 +-
pandas/core/internals/construction.py | 236 +-
pandas/core/internals/managers.py | 641 ++-
pandas/core/missing.py | 292 +-
pandas/core/nanops.py | 297 +-
pandas/core/ops.py | 828 ++--
pandas/core/resample.py | 510 +-
pandas/core/reshape/concat.py | 212 +-
pandas/core/reshape/melt.py | 94 +-
pandas/core/reshape/merge.py | 920 ++--
pandas/core/reshape/pivot.py | 267 +-
pandas/core/reshape/reshape.py | 324 +-
pandas/core/reshape/tile.py | 182 +-
pandas/core/reshape/util.py | 9 +-
pandas/core/series.py | 1041 ++--
pandas/core/sorting.py | 106 +-
pandas/core/sparse/frame.py | 396 +-
pandas/core/sparse/scipy_sparse.py | 49 +-
pandas/core/sparse/series.py | 277 +-
pandas/core/strings.py | 798 +--
pandas/core/tools/datetimes.py | 391 +-
pandas/core/tools/numeric.py | 48 +-
pandas/core/tools/timedeltas.py | 57 +-
pandas/core/util/hashing.py | 132 +-
pandas/core/window.py | 964 ++--
pandas/errors/__init__.py | 11 +-
pandas/io/clipboard/__init__.py | 41 +-
pandas/io/clipboard/clipboards.py | 64 +-
pandas/io/clipboard/exceptions.py | 1 -
pandas/io/clipboard/windows.py | 49 +-
pandas/io/clipboards.py | 55 +-
pandas/io/common.py | 149 +-
pandas/io/date_converters.py | 15 +-
pandas/io/excel/_base.py | 365 +-
pandas/io/excel/_odfreader.py | 44 +-
pandas/io/excel/_openpyxl.py | 109 +-
pandas/io/excel/_util.py | 43 +-
pandas/io/excel/_xlrd.py | 38 +-
pandas/io/excel/_xlsxwriter.py | 240 +-
pandas/io/excel/_xlwt.py | 59 +-
pandas/io/feather_format.py | 36 +-
pandas/io/formats/console.py | 17 +-
pandas/io/formats/css.py | 156 +-
pandas/io/formats/csvs.py | 188 +-
pandas/io/formats/excel.py | 456 +-
pandas/io/formats/format.py | 699 +--
pandas/io/formats/html.py | 285 +-
pandas/io/formats/latex.py | 151 +-
pandas/io/formats/printing.py | 179 +-
pandas/io/formats/style.py | 445 +-
pandas/io/gbq.py | 81 +-
pandas/io/gcs.py | 13 +-
pandas/io/html.py | 254 +-
pandas/io/json/json.py | 581 ++-
pandas/io/json/normalize.py | 74 +-
pandas/io/json/table_schema.py | 152 +-
pandas/io/msgpack/__init__.py | 4 +-
pandas/io/msgpack/exceptions.py | 1 -
pandas/io/packers.py | 732 +--
pandas/io/parquet.py | 136 +-
pandas/io/parsers.py | 1605 +++---
pandas/io/pickle.py | 14 +-
pandas/io/pytables.py | 2055 ++++----
pandas/io/s3.py | 13 +-
pandas/io/sas/sas7bdat.py | 326 +-
pandas/io/sas/sas_constants.py | 140 +-
pandas/io/sas/sas_xport.py | 190 +-
pandas/io/sas/sasreader.py | 38 +-
pandas/io/spss.py | 13 +-
pandas/io/sql.py | 672 ++-
pandas/io/stata.py | 1243 +++--
pandas/plotting/__init__.py | 50 +-
pandas/plotting/_core.py | 380 +-
pandas/plotting/_matplotlib/__init__.py | 81 +-
pandas/plotting/_matplotlib/boxplot.py | 254 +-
pandas/plotting/_matplotlib/compat.py | 12 +-
pandas/plotting/_matplotlib/converter.py | 433 +-
pandas/plotting/_matplotlib/core.py | 613 ++-
pandas/plotting/_matplotlib/hist.py | 316 +-
pandas/plotting/_matplotlib/misc.py | 171 +-
pandas/plotting/_matplotlib/style.py | 24 +-
pandas/plotting/_matplotlib/timeseries.py | 143 +-
pandas/plotting/_matplotlib/tools.py | 103 +-
pandas/plotting/_misc.py | 138 +-
pandas/testing.py | 5 +-
pandas/tests/api/test_api.py | 215 +-
pandas/tests/api/test_types.py | 72 +-
pandas/tests/arithmetic/conftest.py | 150 +-
pandas/tests/arithmetic/test_datetime64.py | 1484 +++---
pandas/tests/arithmetic/test_numeric.py | 544 ++-
pandas/tests/arithmetic/test_object.py | 215 +-
pandas/tests/arithmetic/test_period.py | 641 +--
pandas/tests/arithmetic/test_timedelta64.py | 1071 ++--
pandas/tests/arrays/categorical/common.py | 6 +-
pandas/tests/arrays/categorical/test_algos.py | 78 +-
.../arrays/categorical/test_analytics.py | 144 +-
pandas/tests/arrays/categorical/test_api.py | 261 +-
.../arrays/categorical/test_constructors.py | 293 +-
.../tests/arrays/categorical/test_dtypes.py | 136 +-
.../tests/arrays/categorical/test_indexing.py | 173 +-
.../tests/arrays/categorical/test_missing.py | 36 +-
.../arrays/categorical/test_operators.py | 228 +-
pandas/tests/arrays/categorical/test_repr.py | 116 +-
.../tests/arrays/categorical/test_sorting.py | 38 +-
.../tests/arrays/categorical/test_subclass.py | 13 +-
.../tests/arrays/categorical/test_warnings.py | 12 +-
pandas/tests/arrays/interval/test_interval.py | 75 +-
pandas/tests/arrays/interval/test_ops.py | 56 +-
pandas/tests/arrays/sparse/test_accessor.py | 94 +-
.../tests/arrays/sparse/test_arithmetics.py | 144 +-
pandas/tests/arrays/sparse/test_array.py | 446 +-
pandas/tests/arrays/sparse/test_dtype.py | 182 +-
pandas/tests/arrays/sparse/test_libsparse.py | 266 +-
pandas/tests/arrays/test_array.py | 381 +-
pandas/tests/arrays/test_datetimelike.py | 193 +-
pandas/tests/arrays/test_datetimes.py | 193 +-
pandas/tests/arrays/test_integer.py | 316 +-
pandas/tests/arrays/test_numpy.py | 93 +-
pandas/tests/arrays/test_period.py | 194 +-
pandas/tests/arrays/test_timedeltas.py | 77 +-
pandas/tests/computation/test_compat.py | 15 +-
pandas/tests/computation/test_eval.py | 1173 ++---
pandas/tests/config/test_config.py | 371 +-
pandas/tests/config/test_localization.py | 8 +-
.../dtypes/cast/test_construct_from_scalar.py | 6 +-
.../dtypes/cast/test_construct_ndarray.py | 17 +-
.../dtypes/cast/test_construct_object_arr.py | 6 +-
pandas/tests/dtypes/cast/test_downcast.py | 30 +-
.../dtypes/cast/test_find_common_type.py | 154 +-
.../dtypes/cast/test_infer_datetimelike.py | 13 +-
pandas/tests/dtypes/cast/test_infer_dtype.py | 93 +-
pandas/tests/dtypes/cast/test_promote.py | 575 ++-
pandas/tests/dtypes/cast/test_upcast.py | 90 +-
pandas/tests/dtypes/test_common.py | 480 +-
pandas/tests/dtypes/test_concat.py | 100 +-
pandas/tests/dtypes/test_dtypes.py | 681 +--
pandas/tests/dtypes/test_generic.py | 38 +-
pandas/tests/dtypes/test_inference.py | 978 ++--
pandas/tests/dtypes/test_missing.py | 344 +-
pandas/tests/extension/arrow/bool.py | 32 +-
pandas/tests/extension/arrow/test_bool.py | 4 +-
pandas/tests/extension/base/__init__.py | 8 +-
pandas/tests/extension/base/base.py | 4 +-
pandas/tests/extension/base/constructors.py | 5 +-
pandas/tests/extension/base/dtype.py | 27 +-
pandas/tests/extension/base/getitem.py | 48 +-
pandas/tests/extension/base/groupby.py | 59 +-
pandas/tests/extension/base/interface.py | 9 +-
pandas/tests/extension/base/io.py | 13 +-
pandas/tests/extension/base/methods.py | 136 +-
pandas/tests/extension/base/missing.py | 55 +-
pandas/tests/extension/base/ops.py | 19 +-
pandas/tests/extension/base/printing.py | 6 +-
pandas/tests/extension/base/reduce.py | 13 +-
pandas/tests/extension/base/reshaping.py | 199 +-
pandas/tests/extension/base/setitem.py | 46 +-
pandas/tests/extension/conftest.py | 25 +-
pandas/tests/extension/decimal/__init__.py | 2 +-
pandas/tests/extension/decimal/array.py | 35 +-
.../tests/extension/decimal/test_decimal.py | 147 +-
pandas/tests/extension/json/__init__.py | 2 +-
pandas/tests/extension/json/array.py | 44 +-
pandas/tests/extension/json/test_json.py | 82 +-
pandas/tests/extension/test_categorical.py | 27 +-
pandas/tests/extension/test_common.py | 31 +-
pandas/tests/extension/test_datetime.py | 94 +-
pandas/tests/extension/test_external_block.py | 21 +-
pandas/tests/extension/test_integer.py | 59 +-
pandas/tests/extension/test_interval.py | 7 +-
pandas/tests/extension/test_numpy.py | 62 +-
pandas/tests/extension/test_period.py | 28 +-
pandas/tests/extension/test_sparse.py | 90 +-
pandas/tests/frame/common.py | 114 +-
pandas/tests/frame/conftest.py | 72 +-
pandas/tests/frame/test_alter_axes.py | 1289 ++---
pandas/tests/frame/test_analytics.py | 2001 ++++----
pandas/tests/frame/test_api.py | 264 +-
pandas/tests/frame/test_apply.py | 1013 ++--
pandas/tests/frame/test_arithmetic.py | 303 +-
pandas/tests/frame/test_asof.py | 79 +-
.../tests/frame/test_axis_select_reindex.py | 735 +--
pandas/tests/frame/test_block_internals.py | 461 +-
pandas/tests/frame/test_combine_concat.py | 800 +--
pandas/tests/frame/test_constructors.py | 1827 +++----
pandas/tests/frame/test_convert_to.py | 634 +--
pandas/tests/frame/test_dtypes.py | 1182 +++--
pandas/tests/frame/test_duplicates.py | 300 +-
pandas/tests/frame/test_indexing.py | 1975 ++++----
pandas/tests/frame/test_join.py | 154 +-
pandas/tests/frame/test_missing.py | 721 +--
pandas/tests/frame/test_mutate_columns.py | 209 +-
pandas/tests/frame/test_nonunique_indexes.py | 495 +-
pandas/tests/frame/test_operators.py | 514 +-
pandas/tests/frame/test_period.py | 101 +-
pandas/tests/frame/test_quantile.py | 410 +-
pandas/tests/frame/test_query_eval.py | 709 +--
pandas/tests/frame/test_rank.py | 176 +-
pandas/tests/frame/test_replace.py | 1143 +++--
pandas/tests/frame/test_repr_info.py | 317 +-
pandas/tests/frame/test_reshape.py | 1023 ++--
.../frame/test_sort_values_level_as_str.py | 71 +-
pandas/tests/frame/test_sorting.py | 616 +--
pandas/tests/frame/test_subclass.py | 586 +--
pandas/tests/frame/test_timeseries.py | 590 +--
pandas/tests/frame/test_timezones.py | 179 +-
pandas/tests/frame/test_to_csv.py | 889 ++--
pandas/tests/frame/test_validate.py | 19 +-
pandas/tests/generic/test_frame.py | 220 +-
pandas/tests/generic/test_generic.py | 332 +-
.../generic/test_label_or_level_utils.py | 99 +-
pandas/tests/generic/test_series.py | 151 +-
.../tests/groupby/aggregate/test_aggregate.py | 407 +-
pandas/tests/groupby/aggregate/test_cython.py | 216 +-
pandas/tests/groupby/aggregate/test_other.py | 595 ++-
pandas/tests/groupby/conftest.py | 98 +-
pandas/tests/groupby/test_apply.py | 444 +-
pandas/tests/groupby/test_bin_groupby.py | 59 +-
pandas/tests/groupby/test_categorical.py | 1089 +++--
pandas/tests/groupby/test_counting.py | 125 +-
pandas/tests/groupby/test_filters.py | 338 +-
pandas/tests/groupby/test_function.py | 1259 ++---
pandas/tests/groupby/test_groupby.py | 1187 ++---
pandas/tests/groupby/test_grouping.py | 666 +--
pandas/tests/groupby/test_index_as_string.py | 70 +-
pandas/tests/groupby/test_nth.py | 516 +-
pandas/tests/groupby/test_rank.py | 566 ++-
pandas/tests/groupby/test_timegrouper.py | 845 ++--
pandas/tests/groupby/test_transform.py | 750 +--
pandas/tests/groupby/test_value_counts.py | 48 +-
pandas/tests/groupby/test_whitelist.py | 318 +-
pandas/tests/indexes/common.py | 206 +-
pandas/tests/indexes/conftest.py | 44 +-
pandas/tests/indexes/datetimelike.py | 13 +-
.../indexes/datetimes/test_arithmetic.py | 93 +-
pandas/tests/indexes/datetimes/test_astype.py | 283 +-
.../indexes/datetimes/test_construction.py | 832 ++--
.../indexes/datetimes/test_date_range.py | 699 +--
.../tests/indexes/datetimes/test_datetime.py | 221 +-
.../indexes/datetimes/test_datetimelike.py | 9 +-
.../tests/indexes/datetimes/test_formats.py | 262 +-
.../tests/indexes/datetimes/test_indexing.py | 593 ++-
pandas/tests/indexes/datetimes/test_misc.py | 289 +-
.../tests/indexes/datetimes/test_missing.py | 82 +-
pandas/tests/indexes/datetimes/test_ops.py | 312 +-
.../indexes/datetimes/test_partial_slicing.py | 381 +-
.../indexes/datetimes/test_scalar_compat.py | 272 +-
pandas/tests/indexes/datetimes/test_setops.py | 263 +-
.../tests/indexes/datetimes/test_timezones.py | 956 ++--
pandas/tests/indexes/datetimes/test_tools.py | 2062 ++++----
pandas/tests/indexes/interval/test_astype.py | 139 +-
.../indexes/interval/test_construction.py | 240 +-
.../tests/indexes/interval/test_interval.py | 581 ++-
.../indexes/interval/test_interval_new.py | 213 +-
.../indexes/interval/test_interval_range.py | 229 +-
.../indexes/interval/test_interval_tree.py | 105 +-
pandas/tests/indexes/interval/test_setops.py | 61 +-
pandas/tests/indexes/multi/conftest.py | 47 +-
pandas/tests/indexes/multi/test_analytics.py | 203 +-
pandas/tests/indexes/multi/test_astype.py | 8 +-
pandas/tests/indexes/multi/test_compat.py | 10 +-
.../tests/indexes/multi/test_constructor.py | 481 +-
pandas/tests/indexes/multi/test_contains.py | 66 +-
pandas/tests/indexes/multi/test_conversion.py | 164 +-
pandas/tests/indexes/multi/test_copy.py | 35 +-
pandas/tests/indexes/multi/test_drop.py | 86 +-
pandas/tests/indexes/multi/test_duplicates.py | 156 +-
.../tests/indexes/multi/test_equivalence.py | 36 +-
pandas/tests/indexes/multi/test_format.py | 36 +-
pandas/tests/indexes/multi/test_get_set.py | 154 +-
pandas/tests/indexes/multi/test_indexing.py | 237 +-
pandas/tests/indexes/multi/test_integrity.py | 125 +-
pandas/tests/indexes/multi/test_join.py | 50 +-
pandas/tests/indexes/multi/test_missing.py | 59 +-
pandas/tests/indexes/multi/test_monotonic.py | 131 +-
pandas/tests/indexes/multi/test_names.py | 57 +-
.../indexes/multi/test_partial_indexing.py | 42 +-
pandas/tests/indexes/multi/test_reindex.py | 43 +-
pandas/tests/indexes/multi/test_reshape.py | 100 +-
pandas/tests/indexes/multi/test_set_ops.py | 77 +-
pandas/tests/indexes/multi/test_sorting.py | 138 +-
.../tests/indexes/period/test_arithmetic.py | 87 +-
pandas/tests/indexes/period/test_asfreq.py | 203 +-
pandas/tests/indexes/period/test_astype.py | 78 +-
.../tests/indexes/period/test_construction.py | 398 +-
pandas/tests/indexes/period/test_formats.py | 185 +-
pandas/tests/indexes/period/test_indexing.py | 494 +-
pandas/tests/indexes/period/test_ops.py | 220 +-
.../indexes/period/test_partial_slicing.py | 121 +-
pandas/tests/indexes/period/test_period.py | 344 +-
.../tests/indexes/period/test_period_range.py | 70 +-
.../indexes/period/test_scalar_compat.py | 10 +-
pandas/tests/indexes/period/test_setops.py | 381 +-
pandas/tests/indexes/period/test_tools.py | 322 +-
pandas/tests/indexes/test_base.py | 1791 ++++---
pandas/tests/indexes/test_category.py | 679 +--
pandas/tests/indexes/test_common.py | 75 +-
pandas/tests/indexes/test_frozen.py | 4 +-
pandas/tests/indexes/test_numeric.py | 527 +-
pandas/tests/indexes/test_numpy_compat.py | 58 +-
pandas/tests/indexes/test_range.py | 426 +-
pandas/tests/indexes/test_setops.py | 74 +-
.../indexes/timedeltas/test_arithmetic.py | 187 +-
.../tests/indexes/timedeltas/test_astype.py | 79 +-
.../indexes/timedeltas/test_construction.py | 145 +-
.../tests/indexes/timedeltas/test_formats.py | 108 +-
.../tests/indexes/timedeltas/test_indexing.py | 265 +-
pandas/tests/indexes/timedeltas/test_ops.py | 152 +-
.../timedeltas/test_partial_slicing.py | 59 +-
.../indexes/timedeltas/test_scalar_compat.py | 50 +-
.../tests/indexes/timedeltas/test_setops.py | 98 +-
.../indexes/timedeltas/test_timedelta.py | 189 +-
.../timedeltas/test_timedelta_range.py | 51 +-
pandas/tests/indexes/timedeltas/test_tools.py | 156 +-
pandas/tests/indexing/common.py | 156 +-
pandas/tests/indexing/conftest.py | 27 +-
.../tests/indexing/interval/test_interval.py | 18 +-
.../indexing/interval/test_interval_new.py | 35 +-
pandas/tests/indexing/multiindex/conftest.py | 23 +-
.../multiindex/test_chaining_and_caching.py | 29 +-
.../indexing/multiindex/test_datetime.py | 8 +-
.../tests/indexing/multiindex/test_getitem.py | 210 +-
pandas/tests/indexing/multiindex/test_iloc.py | 82 +-
.../indexing/multiindex/test_indexing_slow.py | 53 +-
pandas/tests/indexing/multiindex/test_ix.py | 41 +-
pandas/tests/indexing/multiindex/test_loc.py | 247 +-
.../indexing/multiindex/test_multiindex.py | 82 +-
.../tests/indexing/multiindex/test_partial.py | 127 +-
.../tests/indexing/multiindex/test_set_ops.py | 25 +-
.../tests/indexing/multiindex/test_setitem.py | 400 +-
.../tests/indexing/multiindex/test_slice.py | 554 ++-
.../tests/indexing/multiindex/test_sorted.py | 55 +-
pandas/tests/indexing/multiindex/test_xs.py | 190 +-
pandas/tests/indexing/test_callable.py | 167 +-
pandas/tests/indexing/test_categorical.py | 515 +-
.../indexing/test_chaining_and_caching.py | 268 +-
pandas/tests/indexing/test_coercion.py | 891 ++--
pandas/tests/indexing/test_datetime.py | 238 +-
pandas/tests/indexing/test_floats.py | 786 +--
pandas/tests/indexing/test_iloc.py | 435 +-
pandas/tests/indexing/test_indexing.py | 929 ++--
.../tests/indexing/test_indexing_engines.py | 33 +-
pandas/tests/indexing/test_indexing_slow.py | 5 +-
pandas/tests/indexing/test_ix.py | 291 +-
pandas/tests/indexing/test_loc.py | 882 ++--
pandas/tests/indexing/test_partial.py | 317 +-
pandas/tests/indexing/test_scalar.py | 111 +-
pandas/tests/indexing/test_timedelta.py | 101 +-
pandas/tests/internals/test_internals.py | 934 ++--
pandas/tests/io/conftest.py | 34 +-
pandas/tests/io/excel/conftest.py | 5 +-
pandas/tests/io/excel/test_odf.py | 17 +-
pandas/tests/io/excel/test_openpyxl.py | 98 +-
pandas/tests/io/excel/test_readers.py | 767 +--
pandas/tests/io/excel/test_style.py | 143 +-
pandas/tests/io/excel/test_writers.py | 745 +--
pandas/tests/io/excel/test_xlrd.py | 7 +-
pandas/tests/io/excel/test_xlsxwriter.py | 21 +-
pandas/tests/io/excel/test_xlwt.py | 33 +-
pandas/tests/io/formats/test_console.py | 50 +-
pandas/tests/io/formats/test_css.py | 297 +-
.../tests/io/formats/test_eng_formatting.py | 165 +-
pandas/tests/io/formats/test_format.py | 2682 +++++-----
pandas/tests/io/formats/test_printing.py | 111 +-
pandas/tests/io/formats/test_style.py | 1794 ++++---
pandas/tests/io/formats/test_to_csv.py | 477 +-
pandas/tests/io/formats/test_to_excel.py | 471 +-
pandas/tests/io/formats/test_to_html.py | 633 +--
pandas/tests/io/formats/test_to_latex.py | 212 +-
.../tests/io/generate_legacy_storage_files.py | 404 +-
pandas/tests/io/json/test_compression.py | 43 +-
.../tests/io/json/test_json_table_schema.py | 852 ++--
pandas/tests/io/json/test_normalize.py | 774 +--
pandas/tests/io/json/test_pandas.py | 1314 +++--
pandas/tests/io/json/test_readlines.py | 81 +-
pandas/tests/io/json/test_ujson.py | 512 +-
pandas/tests/io/msgpack/test_buffer.py | 11 +-
pandas/tests/io/msgpack/test_case.py | 96 +-
pandas/tests/io/msgpack/test_except.py | 11 +-
pandas/tests/io/msgpack/test_extension.py | 50 +-
pandas/tests/io/msgpack/test_format.py | 101 +-
pandas/tests/io/msgpack/test_limits.py | 15 +-
pandas/tests/io/msgpack/test_newspec.py | 58 +-
pandas/tests/io/msgpack/test_obj.py | 33 +-
pandas/tests/io/msgpack/test_pack.py | 86 +-
pandas/tests/io/msgpack/test_read_size.py | 42 +-
pandas/tests/io/msgpack/test_seq.py | 2 +-
pandas/tests/io/msgpack/test_sequnpack.py | 77 +-
pandas/tests/io/msgpack/test_subtype.py | 2 +-
pandas/tests/io/msgpack/test_unpack.py | 20 +-
pandas/tests/io/msgpack/test_unpack_raw.py | 10 +-
pandas/tests/io/parser/conftest.py | 12 +-
pandas/tests/io/parser/test_c_parser_only.py | 289 +-
pandas/tests/io/parser/test_comment.py | 41 +-
pandas/tests/io/parser/test_common.py | 1056 ++--
pandas/tests/io/parser/test_compression.py | 22 +-
pandas/tests/io/parser/test_converters.py | 42 +-
pandas/tests/io/parser/test_dialect.py | 65 +-
pandas/tests/io/parser/test_dtypes.py | 316 +-
pandas/tests/io/parser/test_header.py | 375 +-
pandas/tests/io/parser/test_index_col.py | 112 +-
pandas/tests/io/parser/test_mangle_dupes.py | 89 +-
pandas/tests/io/parser/test_multi_thread.py | 37 +-
pandas/tests/io/parser/test_na_values.py | 384 +-
pandas/tests/io/parser/test_network.py | 126 +-
pandas/tests/io/parser/test_parse_dates.py | 1357 ++++--
.../io/parser/test_python_parser_only.py | 96 +-
pandas/tests/io/parser/test_quoting.py | 88 +-
pandas/tests/io/parser/test_read_fwf.py | 235 +-
pandas/tests/io/parser/test_skiprows.py | 182 +-
pandas/tests/io/parser/test_textreader.py | 259 +-
pandas/tests/io/parser/test_unsupported.py | 47 +-
pandas/tests/io/parser/test_usecols.py | 377 +-
pandas/tests/io/pytables/test_compat.py | 24 +-
pandas/tests/io/pytables/test_pytables.py | 4292 +++++++++--------
pandas/tests/io/sas/test_sas.py | 11 +-
pandas/tests/io/sas/test_sas7bdat.py | 88 +-
pandas/tests/io/sas/test_xport.py | 17 +-
pandas/tests/io/test_clipboard.py | 191 +-
pandas/tests/io/test_common.py | 267 +-
pandas/tests/io/test_compression.py | 92 +-
pandas/tests/io/test_date_converters.py | 13 +-
pandas/tests/io/test_feather.py | 106 +-
pandas/tests/io/test_gbq.py | 72 +-
pandas/tests/io/test_gcs.py | 65 +-
pandas/tests/io/test_html.py | 652 +--
pandas/tests/io/test_packers.py | 503 +-
pandas/tests/io/test_parquet.py | 386 +-
pandas/tests/io/test_pickle.py | 76 +-
pandas/tests/io/test_s3.py | 8 +-
pandas/tests/io/test_spss.py | 7 +-
pandas/tests/io/test_sql.py | 1734 ++++---
pandas/tests/io/test_stata.py | 1482 +++---
pandas/tests/plotting/common.py | 133 +-
pandas/tests/plotting/test_backend.py | 27 +-
pandas/tests/plotting/test_boxplot_method.py | 290 +-
pandas/tests/plotting/test_converter.py | 154 +-
pandas/tests/plotting/test_datetimelike.py | 607 +--
pandas/tests/plotting/test_frame.py | 1817 +++----
pandas/tests/plotting/test_groupby.py | 38 +-
pandas/tests/plotting/test_hist_method.py | 155 +-
pandas/tests/plotting/test_misc.py | 354 +-
pandas/tests/plotting/test_series.py | 329 +-
pandas/tests/reductions/test_reductions.py | 526 +-
.../tests/reductions/test_stat_reductions.py | 102 +-
pandas/tests/resample/conftest.py | 36 +-
pandas/tests/resample/test_base.py | 95 +-
pandas/tests/resample/test_datetime_index.py | 1277 ++---
pandas/tests/resample/test_period_index.py | 815 ++--
pandas/tests/resample/test_resample_api.py | 505 +-
.../tests/resample/test_resampler_grouper.py | 238 +-
pandas/tests/resample/test_time_grouper.py | 241 +-
pandas/tests/resample/test_timedelta.py | 110 +-
pandas/tests/reshape/merge/test_join.py | 755 +--
pandas/tests/reshape/merge/test_merge.py | 2257 +++++----
pandas/tests/reshape/merge/test_merge_asof.py | 1499 +++---
.../merge/test_merge_index_as_string.py | 99 +-
.../tests/reshape/merge/test_merge_ordered.py | 100 +-
pandas/tests/reshape/merge/test_multi.py | 914 ++--
pandas/tests/reshape/test_concat.py | 1973 ++++----
pandas/tests/reshape/test_cut.py | 366 +-
pandas/tests/reshape/test_melt.py | 1271 +++--
pandas/tests/reshape/test_pivot.py | 2829 ++++++-----
pandas/tests/reshape/test_qcut.py | 131 +-
pandas/tests/reshape/test_reshape.py | 568 +--
.../tests/reshape/test_union_categoricals.py | 228 +-
pandas/tests/reshape/test_util.py | 16 +-
pandas/tests/scalar/interval/test_interval.py | 117 +-
pandas/tests/scalar/interval/test_ops.py | 26 +-
pandas/tests/scalar/period/test_asfreq.py | 1140 ++---
pandas/tests/scalar/period/test_period.py | 1170 ++---
pandas/tests/scalar/test_nat.py | 263 +-
.../tests/scalar/timedelta/test_arithmetic.py | 243 +-
.../scalar/timedelta/test_construction.py | 286 +-
pandas/tests/scalar/timedelta/test_formats.py | 49 +-
.../tests/scalar/timedelta/test_timedelta.py | 649 +--
.../tests/scalar/timestamp/test_arithmetic.py | 56 +-
.../scalar/timestamp/test_comparisons.py | 41 +-
.../tests/scalar/timestamp/test_rendering.py | 38 +-
.../tests/scalar/timestamp/test_timestamp.py | 588 ++-
.../tests/scalar/timestamp/test_timezones.py | 330 +-
.../tests/scalar/timestamp/test_unary_ops.py | 297 +-
pandas/tests/series/common.py | 7 +-
pandas/tests/series/conftest.py | 6 +-
pandas/tests/series/indexing/conftest.py | 2 +-
.../tests/series/indexing/test_alter_index.py | 248 +-
pandas/tests/series/indexing/test_boolean.py | 185 +-
pandas/tests/series/indexing/test_callable.py | 16 +-
pandas/tests/series/indexing/test_datetime.py | 309 +-
pandas/tests/series/indexing/test_indexing.py | 323 +-
pandas/tests/series/indexing/test_loc.py | 32 +-
pandas/tests/series/indexing/test_numeric.py | 155 +-
pandas/tests/series/test_alter_axes.py | 210 +-
pandas/tests/series/test_analytics.py | 938 ++--
pandas/tests/series/test_api.py | 300 +-
pandas/tests/series/test_apply.py | 562 ++-
pandas/tests/series/test_arithmetic.py | 65 +-
pandas/tests/series/test_asof.py | 58 +-
pandas/tests/series/test_block_internals.py | 12 +-
pandas/tests/series/test_combine_concat.py | 284 +-
pandas/tests/series/test_constructors.py | 774 +--
pandas/tests/series/test_datetime_values.py | 539 ++-
pandas/tests/series/test_dtypes.py | 313 +-
pandas/tests/series/test_duplicates.py | 82 +-
pandas/tests/series/test_internals.py | 112 +-
pandas/tests/series/test_io.py | 118 +-
pandas/tests/series/test_missing.py | 1194 +++--
pandas/tests/series/test_operators.py | 327 +-
pandas/tests/series/test_period.py | 126 +-
pandas/tests/series/test_quantile.py | 122 +-
pandas/tests/series/test_rank.py | 463 +-
pandas/tests/series/test_replace.py | 140 +-
pandas/tests/series/test_repr.py | 150 +-
pandas/tests/series/test_sorting.py | 100 +-
pandas/tests/series/test_subclass.py | 59 +-
pandas/tests/series/test_timeseries.py | 574 ++-
pandas/tests/series/test_timezones.py | 235 +-
pandas/tests/series/test_ufunc.py | 121 +-
pandas/tests/series/test_validate.py | 9 +-
pandas/tests/sparse/frame/conftest.py | 33 +-
pandas/tests/sparse/frame/test_analytics.py | 4 +-
pandas/tests/sparse/frame/test_apply.py | 39 +-
pandas/tests/sparse/frame/test_frame.py | 1043 ++--
pandas/tests/sparse/frame/test_indexing.py | 74 +-
pandas/tests/sparse/frame/test_to_csv.py | 9 +-
.../tests/sparse/frame/test_to_from_scipy.py | 60 +-
pandas/tests/sparse/series/test_indexing.py | 82 +-
pandas/tests/sparse/series/test_series.py | 783 +--
pandas/tests/sparse/test_combine_concat.py | 230 +-
pandas/tests/sparse/test_format.py | 125 +-
pandas/tests/sparse/test_groupby.py | 48 +-
pandas/tests/sparse/test_indexing.py | 751 ++-
pandas/tests/sparse/test_pivot.py | 68 +-
pandas/tests/sparse/test_reshape.py | 4 +-
pandas/tests/test_algos.py | 1361 ++++--
pandas/tests/test_base.py | 778 +--
pandas/tests/test_common.py | 57 +-
pandas/tests/test_downstream.py | 36 +-
pandas/tests/test_errors.py | 24 +-
pandas/tests/test_expressions.py | 263 +-
pandas/tests/test_join.py | 196 +-
pandas/tests/test_lib.py | 37 +-
pandas/tests/test_multilevel.py | 1638 ++++---
pandas/tests/test_nanops.py | 815 ++--
pandas/tests/test_optional_dependency.py | 10 +-
pandas/tests/test_register_accessor.py | 46 +-
pandas/tests/test_sorting.py | 265 +-
pandas/tests/test_strings.py | 2511 +++++-----
pandas/tests/test_take.py | 203 +-
pandas/tests/test_window.py | 3378 +++++++------
pandas/tests/tools/test_numeric.py | 371 +-
.../tseries/frequencies/test_freq_code.py | 184 +-
.../tseries/frequencies/test_inference.py | 324 +-
.../tseries/frequencies/test_to_offset.py | 184 +-
pandas/tests/tseries/holiday/test_calendar.py | 38 +-
pandas/tests/tseries/holiday/test_federal.py | 34 +-
pandas/tests/tseries/holiday/test_holiday.py | 289 +-
.../tests/tseries/holiday/test_observance.py | 70 +-
pandas/tests/tseries/offsets/common.py | 14 +-
pandas/tests/tseries/offsets/conftest.py | 10 +-
pandas/tests/tseries/offsets/test_fiscal.py | 613 +--
pandas/tests/tseries/offsets/test_offsets.py | 4208 +++++++++-------
.../offsets/test_offsets_properties.py | 68 +-
pandas/tests/tseries/offsets/test_ticks.py | 174 +-
.../tests/tseries/offsets/test_yqm_offsets.py | 1624 ++++---
pandas/tests/tslibs/test_api.py | 66 +-
pandas/tests/tslibs/test_array_to_datetime.py | 101 +-
pandas/tests/tslibs/test_ccalendar.py | 15 +-
pandas/tests/tslibs/test_conversion.py | 41 +-
pandas/tests/tslibs/test_libfrequencies.py | 146 +-
pandas/tests/tslibs/test_liboffsets.py | 162 +-
pandas/tests/tslibs/test_normalize_date.py | 25 +-
pandas/tests/tslibs/test_parse_iso8601.py | 76 +-
pandas/tests/tslibs/test_parsing.py | 194 +-
pandas/tests/tslibs/test_period_asfreq.py | 125 +-
pandas/tests/tslibs/test_timedeltas.py | 21 +-
pandas/tests/tslibs/test_timezones.py | 30 +-
pandas/tests/util/test_assert_almost_equal.py | 176 +-
.../util/test_assert_categorical_equal.py | 12 +-
.../util/test_assert_extension_array_equal.py | 25 +-
pandas/tests/util/test_assert_frame_equal.py | 124 +-
pandas/tests/util/test_assert_index_equal.py | 35 +-
.../util/test_assert_interval_array_equal.py | 13 +-
.../util/test_assert_numpy_array_equal.py | 46 +-
.../util/test_assert_produces_warning.py | 9 +-
pandas/tests/util/test_assert_series_equal.py | 75 +-
pandas/tests/util/test_deprecate.py | 28 +-
pandas/tests/util/test_deprecate_kwarg.py | 6 +-
pandas/tests/util/test_hashing.py | 176 +-
pandas/tests/util/test_move.py | 1 +
pandas/tests/util/test_safe_import.py | 13 +-
pandas/tests/util/test_util.py | 5 +-
pandas/tests/util/test_validate_args.py | 27 +-
.../util/test_validate_args_and_kwargs.py | 56 +-
pandas/tests/util/test_validate_kwargs.py | 18 +-
pandas/tseries/converter.py | 24 +-
pandas/tseries/frequencies.py | 155 +-
pandas/tseries/holiday.py | 131 +-
pandas/tseries/offsets.py | 936 ++--
pandas/util/__init__.py | 3 +-
pandas/util/_decorators.py | 106 +-
pandas/util/_depr_module.py | 32 +-
pandas/util/_doctools.py | 72 +-
pandas/util/_print_versions.py | 85 +-
pandas/util/_test_decorators.py | 89 +-
pandas/util/_tester.py | 6 +-
pandas/util/_validators.py | 96 +-
pandas/util/testing.py | 1176 +++--
scripts/download_wheels.py | 20 +-
scripts/find_commits_touching_func.py | 131 +-
scripts/generate_pip_deps_from_conda.py | 61 +-
scripts/merge-pr.py | 146 +-
scripts/tests/conftest.py | 7 +-
scripts/tests/test_validate_docstrings.py | 687 ++-
scripts/validate_docstrings.py | 702 +--
setup.py | 821 ++--
versioneer.py | 213 +-
748 files changed, 126206 insertions(+), 97282 deletions(-)
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index b69efb4689486..436093ef195ef 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -5,7 +5,7 @@
import pandas as pd
from pandas.util import testing as tm
-for imp in ['pandas.util', 'pandas.tools.hashing']:
+for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
break
@@ -15,15 +15,17 @@
class Factorize:
- params = [[True, False], ['int', 'uint', 'float', 'string']]
- param_names = ['sort', 'dtype']
+ params = [[True, False], ["int", "uint", "float", "string"]]
+ param_names = ["sort", "dtype"]
def setup(self, sort, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
- 'uint': pd.UInt64Index(np.arange(N).repeat(5)),
- 'float': pd.Float64Index(np.random.randn(N).repeat(5)),
- 'string': tm.makeStringIndex(N).repeat(5)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N).repeat(5)),
+ "uint": pd.UInt64Index(np.arange(N).repeat(5)),
+ "float": pd.Float64Index(np.random.randn(N).repeat(5)),
+ "string": tm.makeStringIndex(N).repeat(5),
+ }
self.idx = data[dtype]
def time_factorize(self, sort, dtype):
@@ -32,15 +34,17 @@ def time_factorize(self, sort, dtype):
class FactorizeUnique:
- params = [[True, False], ['int', 'uint', 'float', 'string']]
- param_names = ['sort', 'dtype']
+ params = [[True, False], ["int", "uint", "float", "string"]]
+ param_names = ["sort", "dtype"]
def setup(self, sort, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N)),
- 'uint': pd.UInt64Index(np.arange(N)),
- 'float': pd.Float64Index(np.arange(N)),
- 'string': tm.makeStringIndex(N)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N)),
+ "uint": pd.UInt64Index(np.arange(N)),
+ "float": pd.Float64Index(np.arange(N)),
+ "string": tm.makeStringIndex(N),
+ }
self.idx = data[dtype]
assert self.idx.is_unique
@@ -50,15 +54,17 @@ def time_factorize(self, sort, dtype):
class Duplicated:
- params = [['first', 'last', False], ['int', 'uint', 'float', 'string']]
- param_names = ['keep', 'dtype']
+ params = [["first", "last", False], ["int", "uint", "float", "string"]]
+ param_names = ["keep", "dtype"]
def setup(self, keep, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
- 'uint': pd.UInt64Index(np.arange(N).repeat(5)),
- 'float': pd.Float64Index(np.random.randn(N).repeat(5)),
- 'string': tm.makeStringIndex(N).repeat(5)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N).repeat(5)),
+ "uint": pd.UInt64Index(np.arange(N).repeat(5)),
+ "float": pd.Float64Index(np.random.randn(N).repeat(5)),
+ "string": tm.makeStringIndex(N).repeat(5),
+ }
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
@@ -69,15 +75,17 @@ def time_duplicated(self, keep, dtype):
class DuplicatedUniqueIndex:
- params = ['int', 'uint', 'float', 'string']
- param_names = ['dtype']
+ params = ["int", "uint", "float", "string"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**5
- data = {'int': pd.Int64Index(np.arange(N)),
- 'uint': pd.UInt64Index(np.arange(N)),
- 'float': pd.Float64Index(np.random.randn(N)),
- 'string': tm.makeStringIndex(N)}
+ N = 10 ** 5
+ data = {
+ "int": pd.Int64Index(np.arange(N)),
+ "uint": pd.UInt64Index(np.arange(N)),
+ "float": pd.Float64Index(np.random.randn(N)),
+ "string": tm.makeStringIndex(N),
+ }
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique
@@ -87,18 +95,21 @@ def time_duplicated_unique(self, dtype):
class Hashing:
-
def setup_cache(self):
- N = 10**5
+ N = 10 ** 5
df = pd.DataFrame(
- {'strings': pd.Series(tm.makeStringIndex(10000).take(
- np.random.randint(0, 10000, size=N))),
- 'floats': np.random.randn(N),
- 'ints': np.arange(N),
- 'dates': pd.date_range('20110101', freq='s', periods=N),
- 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
- df['categories'] = df['strings'].astype('category')
+ {
+ "strings": pd.Series(
+ tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
+ ),
+ "floats": np.random.randn(N),
+ "ints": np.arange(N),
+ "dates": pd.date_range("20110101", freq="s", periods=N),
+ "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N),
+ }
+ )
+ df["categories"] = df["strings"].astype("category")
df.iloc[10:20] = np.nan
return df
@@ -106,35 +117,39 @@ def time_frame(self, df):
hashing.hash_pandas_object(df)
def time_series_int(self, df):
- hashing.hash_pandas_object(df['ints'])
+ hashing.hash_pandas_object(df["ints"])
def time_series_string(self, df):
- hashing.hash_pandas_object(df['strings'])
+ hashing.hash_pandas_object(df["strings"])
def time_series_float(self, df):
- hashing.hash_pandas_object(df['floats'])
+ hashing.hash_pandas_object(df["floats"])
def time_series_categorical(self, df):
- hashing.hash_pandas_object(df['categories'])
+ hashing.hash_pandas_object(df["categories"])
def time_series_timedeltas(self, df):
- hashing.hash_pandas_object(df['timedeltas'])
+ hashing.hash_pandas_object(df["timedeltas"])
def time_series_dates(self, df):
- hashing.hash_pandas_object(df['dates'])
+ hashing.hash_pandas_object(df["dates"])
class Quantile:
- params = [[0, 0.5, 1],
- ['linear', 'nearest', 'lower', 'higher', 'midpoint'],
- ['float', 'int', 'uint']]
- param_names = ['quantile', 'interpolation', 'dtype']
+ params = [
+ [0, 0.5, 1],
+ ["linear", "nearest", "lower", "higher", "midpoint"],
+ ["float", "int", "uint"],
+ ]
+ param_names = ["quantile", "interpolation", "dtype"]
def setup(self, quantile, interpolation, dtype):
- N = 10**5
- data = {'int': np.arange(N),
- 'uint': np.arange(N).astype(np.uint64),
- 'float': np.random.randn(N)}
+ N = 10 ** 5
+ data = {
+ "int": np.arange(N),
+ "uint": np.arange(N).astype(np.uint64),
+ "float": np.random.randn(N),
+ }
self.idx = pd.Series(data[dtype].repeat(5))
def time_quantile(self, quantile, interpolation, dtype):
@@ -142,12 +157,12 @@ def time_quantile(self, quantile, interpolation, dtype):
class SortIntegerArray:
- params = [10**3, 10**5]
+ params = [10 ** 3, 10 ** 5]
def setup(self, N):
data = np.arange(N, dtype=float)
data[40] = np.nan
- self.array = pd.array(data, dtype='Int64')
+ self.array = pd.array(data, dtype="Int64")
def time_argsort(self, N):
self.array.argsort()
diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py
index dd316a2bc88d0..c43e5dfd729aa 100644
--- a/asv_bench/benchmarks/attrs_caching.py
+++ b/asv_bench/benchmarks/attrs_caching.py
@@ -1,5 +1,6 @@
import numpy as np
from pandas import DataFrame
+
try:
from pandas.util import cache_readonly
except ImportError:
@@ -7,7 +8,6 @@
class DataFrameAttributes:
-
def setup(self):
self.df = DataFrame(np.random.randn(10, 6))
self.cur_index = self.df.index
@@ -20,14 +20,12 @@ def time_set_index(self):
class CacheReadonly:
-
def setup(self):
-
class Foo:
-
@cache_readonly
def prop(self):
return 5
+
self.obj = Foo()
def time_cache_readonly(self):
diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py
index 26cd66284c41e..fd3324b78f1c3 100644
--- a/asv_bench/benchmarks/binary_ops.py
+++ b/asv_bench/benchmarks/binary_ops.py
@@ -1,6 +1,7 @@
import numpy as np
from pandas import DataFrame, Series, date_range
from pandas.core.algorithms import checked_add_with_arr
+
try:
import pandas.core.computation.expressions as expr
except ImportError:
@@ -9,14 +10,14 @@
class Ops:
- params = [[True, False], ['default', 1]]
- param_names = ['use_numexpr', 'threads']
+ params = [[True, False], ["default", 1]]
+ param_names = ["use_numexpr", "threads"]
def setup(self, use_numexpr, threads):
self.df = DataFrame(np.random.randn(20000, 100))
self.df2 = DataFrame(np.random.randn(20000, 100))
- if threads != 'default':
+ if threads != "default":
expr.set_numexpr_threads(threads)
if not use_numexpr:
expr.set_use_numexpr(False)
@@ -39,18 +40,21 @@ def teardown(self, use_numexpr, threads):
class Ops2:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N, N))
self.df2 = DataFrame(np.random.randn(N, N))
- self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max,
- size=(N, N)))
- self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max,
- size=(N, N)))
+ self.df_int = DataFrame(
+ np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N)
+ )
+ )
+ self.df2_int = DataFrame(
+ np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max, size=(N, N)
+ )
+ )
self.s = Series(np.random.randn(N))
@@ -90,16 +94,16 @@ def time_frame_series_dot(self):
class Timeseries:
- params = [None, 'US/Eastern']
- param_names = ['tz']
+ params = [None, "US/Eastern"]
+ param_names = ["tz"]
def setup(self, tz):
- N = 10**6
+ N = 10 ** 6
halfway = (N // 2) - 1
- self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz))
+ self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz))
self.ts = self.s[halfway]
- self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz))
+ self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz))
def time_series_timestamp_compare(self, tz):
self.s <= self.ts
@@ -117,10 +121,10 @@ def time_timestamp_ops_diff_with_shift(self, tz):
class AddOverflowScalar:
params = [1, -1, 0]
- param_names = ['scalar']
+ param_names = ["scalar"]
def setup(self, scalar):
- N = 10**6
+ N = 10 ** 6
self.arr = np.arange(N)
def time_add_overflow_scalar(self, scalar):
@@ -128,9 +132,8 @@ def time_add_overflow_scalar(self, scalar):
class AddOverflowArray:
-
def setup(self):
- N = 10**6
+ N = 10 ** 6
self.arr = np.arange(N)
self.arr_rev = np.arange(-N, 0)
self.arr_mixed = np.array([1, -1]).repeat(N / 2)
@@ -144,12 +147,12 @@ def time_add_overflow_arr_mask_nan(self):
checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1)
def time_add_overflow_b_mask_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed,
- b_mask=self.arr_nan_1)
+ checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1)
def time_add_overflow_both_arg_nan(self):
- checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1,
- b_mask=self.arr_nan_2)
+ checked_add_with_arr(
+ self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2
+ )
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index f1afca5941fe5..933946b1ca1ac 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pandas.util.testing as tm
+
try:
from pandas.api.types import union_categoricals
except ImportError:
@@ -11,13 +12,12 @@
class Concat:
-
def setup(self):
- N = 10**5
- self.s = pd.Series(list('aabbcd') * N).astype('category')
+ N = 10 ** 5
+ self.s = pd.Series(list("aabbcd") * N).astype("category")
- self.a = pd.Categorical(list('aabbcd') * N)
- self.b = pd.Categorical(list('bbcdjk') * N)
+ self.a = pd.Categorical(list("aabbcd") * N)
+ self.b = pd.Categorical(list("bbcdjk") * N)
def time_concat(self):
pd.concat([self.s, self.s])
@@ -27,23 +27,22 @@ def time_union(self):
class Constructor:
-
def setup(self):
- N = 10**5
- self.categories = list('abcde')
+ N = 10 ** 5
+ self.categories = list("abcde")
self.cat_idx = pd.Index(self.categories)
self.values = np.tile(self.categories, N)
self.codes = np.tile(range(len(self.categories)), N)
- self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00',
- periods=N / 10,
- freq='s'))
+ self.datetimes = pd.Series(
+ pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s")
+ )
self.datetimes_with_nat = self.datetimes.copy()
self.datetimes_with_nat.iloc[-1] = pd.NaT
self.values_some_nan = list(np.tile(self.categories + [np.nan], N))
self.values_all_nan = [np.nan] * len(self.values)
- self.values_all_int8 = np.ones(N, 'int8')
+ self.values_all_int8 = np.ones(N, "int8")
self.categorical = pd.Categorical(self.values, self.categories)
self.series = pd.Series(self.categorical)
@@ -78,62 +77,55 @@ def time_existing_series(self):
class ValueCounts:
params = [True, False]
- param_names = ['dropna']
+ param_names = ["dropna"]
def setup(self, dropna):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_value_counts(self, dropna):
self.ts.value_counts(dropna=dropna)
class Repr:
-
def setup(self):
- self.sel = pd.Series(['s1234']).astype('category')
+ self.sel = pd.Series(["s1234"]).astype("category")
def time_rendering(self):
str(self.sel)
class SetCategories:
-
def setup(self):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_set_categories(self):
self.ts.cat.set_categories(self.ts.cat.categories[::2])
class RemoveCategories:
-
def setup(self):
- n = 5 * 10**5
- arr = ['s{:04d}'.format(i) for i in np.random.randint(0, n // 10,
- size=n)]
- self.ts = pd.Series(arr).astype('category')
+ n = 5 * 10 ** 5
+ arr = ["s{:04d}".format(i) for i in np.random.randint(0, n // 10, size=n)]
+ self.ts = pd.Series(arr).astype("category")
def time_remove_categories(self):
self.ts.cat.remove_categories(self.ts.cat.categories[::2])
class Rank:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
ncats = 100
self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str)
- self.s_str_cat = self.s_str.astype('category')
+ self.s_str_cat = self.s_str.astype("category")
self.s_str_cat_ordered = self.s_str_cat.cat.as_ordered()
self.s_int = pd.Series(np.random.randint(0, ncats, size=N))
- self.s_int_cat = self.s_int.astype('category')
+ self.s_int_cat = self.s_int.astype("category")
self.s_int_cat_ordered = self.s_int_cat.cat.as_ordered()
def time_rank_string(self):
@@ -157,28 +149,27 @@ def time_rank_int_cat_ordered(self):
class Isin:
- params = ['object', 'int64']
- param_names = ['dtype']
+ params = ["object", "int64"]
+ param_names = ["dtype"]
def setup(self, dtype):
np.random.seed(1234)
- n = 5 * 10**5
+ n = 5 * 10 ** 5
sample_size = 100
arr = [i for i in np.random.randint(0, n // 10, size=n)]
- if dtype == 'object':
- arr = ['s{:04d}'.format(i) for i in arr]
+ if dtype == "object":
+ arr = ["s{:04d}".format(i) for i in arr]
self.sample = np.random.choice(arr, sample_size)
- self.series = pd.Series(arr).astype('category')
+ self.series = pd.Series(arr).astype("category")
def time_isin_categorical(self, dtype):
self.series.isin(self.sample)
class IsMonotonic:
-
def setup(self):
N = 1000
- self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N))
+ self.c = pd.CategoricalIndex(list("a" * N + "b" * N + "c" * N))
self.s = pd.Series(self.c)
def time_categorical_index_is_monotonic_increasing(self):
@@ -195,9 +186,8 @@ def time_categorical_series_is_monotonic_decreasing(self):
class Contains:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
self.ci = tm.makeCategoricalIndex(N)
self.c = self.ci.values
self.key = self.ci.categories[0]
@@ -211,34 +201,33 @@ def time_categorical_contains(self):
class CategoricalSlicing:
- params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
- param_names = ['index']
+ params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
+ param_names = ["index"]
def setup(self, index):
- N = 10**6
- categories = ['a', 'b', 'c']
+ N = 10 ** 6
+ categories = ["a", "b", "c"]
values = [0] * N + [1] * N + [2] * N
- if index == 'monotonic_incr':
- self.data = pd.Categorical.from_codes(values,
- categories=categories)
- elif index == 'monotonic_decr':
- self.data = pd.Categorical.from_codes(list(reversed(values)),
- categories=categories)
- elif index == 'non_monotonic':
- self.data = pd.Categorical.from_codes([0, 1, 2] * N,
- categories=categories)
+ if index == "monotonic_incr":
+ self.data = pd.Categorical.from_codes(values, categories=categories)
+ elif index == "monotonic_decr":
+ self.data = pd.Categorical.from_codes(
+ list(reversed(values)), categories=categories
+ )
+ elif index == "non_monotonic":
+ self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories)
else:
- raise ValueError('Invalid index param: {}'.format(index))
+ raise ValueError("Invalid index param: {}".format(index))
self.scalar = 10000
self.list = list(range(10000))
- self.cat_scalar = 'b'
+ self.cat_scalar = "b"
def time_getitem_scalar(self, index):
self.data[self.scalar]
def time_getitem_slice(self, index):
- self.data[:self.scalar]
+ self.data[: self.scalar]
def time_getitem_list_like(self, index):
self.data[[self.scalar]]
@@ -251,9 +240,8 @@ def time_getitem_bool_array(self, index):
class Indexing:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
self.index = pd.CategoricalIndex(range(N), range(N))
self.series = pd.Series(range(N), index=self.index).sort_index()
self.category = self.index[500]
@@ -268,7 +256,7 @@ def time_shallow_copy(self):
self.index._shallow_copy()
def time_align(self):
- pd.DataFrame({'a': self.series, 'b': self.series[:500]})
+ pd.DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index[:750].intersection(self.index[250:])
@@ -280,7 +268,7 @@ def time_reindex(self):
self.index.reindex(self.index[:500])
def time_reindex_missing(self):
- self.index.reindex(['a', 'b', 'c', 'd'])
+ self.index.reindex(["a", "b", "c", "d"])
def time_sort_values(self):
self.index.sort_values(ascending=False)
diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py
index 42adede631a01..654075292cdf6 100644
--- a/asv_bench/benchmarks/ctors.py
+++ b/asv_bench/benchmarks/ctors.py
@@ -42,18 +42,22 @@ def list_of_lists_with_none(arr):
class SeriesConstructors:
param_names = ["data_fmt", "with_index", "dtype"]
- params = [[no_change,
- list,
- list_of_str,
- gen_of_str,
- arr_dict,
- list_of_tuples,
- gen_of_tuples,
- list_of_lists,
- list_of_tuples_with_none,
- list_of_lists_with_none],
- [False, True],
- ['float', 'int']]
+ params = [
+ [
+ no_change,
+ list,
+ list_of_str,
+ gen_of_str,
+ arr_dict,
+ list_of_tuples,
+ gen_of_tuples,
+ list_of_lists,
+ list_of_tuples_with_none,
+ list_of_lists_with_none,
+ ],
+ [False, True],
+ ["float", "int"],
+ ]
# Generators get exhausted on use, so run setup before every call
number = 1
@@ -61,10 +65,11 @@ class SeriesConstructors:
def setup(self, data_fmt, with_index, dtype):
if data_fmt in (gen_of_str, gen_of_tuples) and with_index:
- raise NotImplementedError('Series constructors do not support '
- 'using generators with indexes')
- N = 10**4
- if dtype == 'float':
+ raise NotImplementedError(
+ "Series constructors do not support " "using generators with indexes"
+ )
+ N = 10 ** 4
+ if dtype == "float":
arr = np.random.randn(N)
else:
arr = np.arange(N)
@@ -76,13 +81,15 @@ def time_series_constructor(self, data_fmt, with_index, dtype):
class SeriesDtypesConstructors:
-
def setup(self):
- N = 10**4
+ N = 10 ** 4
self.arr = np.random.randn(N)
- self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object)
- self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
- Timestamp('20130101')] * N * 10)
+ self.arr_str = np.array(["foo", "bar", "baz"], dtype=object)
+ self.s = Series(
+ [Timestamp("20110101"), Timestamp("20120101"), Timestamp("20130101")]
+ * N
+ * 10
+ )
def time_index_from_array_string(self):
Index(self.arr_str)
@@ -98,9 +105,8 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
-
def setup(self):
- N = 10**4
+ N = 10 ** 4
self.iterables = [tm.makeStringIndex(N), range(20)]
def time_multiindex_from_iterables(self):
diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py
index 9bfaaa8696009..60800b1f9cae7 100644
--- a/asv_bench/benchmarks/dtypes.py
+++ b/asv_bench/benchmarks/dtypes.py
@@ -2,32 +2,36 @@
import numpy as np
from .pandas_vb_common import (
- numeric_dtypes, datetime_dtypes, string_dtypes, extension_dtypes)
+ numeric_dtypes,
+ datetime_dtypes,
+ string_dtypes,
+ extension_dtypes,
+)
-_numpy_dtypes = [np.dtype(dtype)
- for dtype in (numeric_dtypes +
- datetime_dtypes +
- string_dtypes)]
+_numpy_dtypes = [
+ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes)
+]
_dtypes = _numpy_dtypes + extension_dtypes
class Dtypes:
- params = (_dtypes +
- list(map(lambda dt: dt.name, _dtypes)))
- param_names = ['dtype']
+ params = _dtypes + list(map(lambda dt: dt.name, _dtypes))
+ param_names = ["dtype"]
def time_pandas_dtype(self, dtype):
pandas_dtype(dtype)
class DtypesInvalid:
- param_names = ['dtype']
- params = ['scalar-string', 'scalar-int', 'list-string', 'array-string']
- data_dict = {'scalar-string': 'foo',
- 'scalar-int': 1,
- 'list-string': ['foo'] * 1000,
- 'array-string': np.array(['foo'] * 1000)}
+ param_names = ["dtype"]
+ params = ["scalar-string", "scalar-int", "list-string", "array-string"]
+ data_dict = {
+ "scalar-string": "foo",
+ "scalar-int": 1,
+ "list-string": ["foo"] * 1000,
+ "array-string": np.array(["foo"] * 1000),
+ }
def time_pandas_dtype_invalid(self, dtype):
try:
diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py
index be47d35f2cad1..84e94315cc28b 100644
--- a/asv_bench/benchmarks/eval.py
+++ b/asv_bench/benchmarks/eval.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
+
try:
import pandas.core.computation.expressions as expr
except ImportError:
@@ -8,8 +9,8 @@
class Eval:
- params = [['numexpr', 'python'], [1, 'all']]
- param_names = ['engine', 'threads']
+ params = [["numexpr", "python"], [1, "all"]]
+ param_names = ["engine", "threads"]
def setup(self, engine, threads):
self.df = pd.DataFrame(np.random.randn(20000, 100))
@@ -21,44 +22,44 @@ def setup(self, engine, threads):
expr.set_numexpr_threads(1)
def time_add(self, engine, threads):
- pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine)
+ pd.eval("self.df + self.df2 + self.df3 + self.df4", engine=engine)
def time_and(self, engine, threads):
- pd.eval('(self.df > 0) & (self.df2 > 0) & '
- '(self.df3 > 0) & (self.df4 > 0)', engine=engine)
+ pd.eval(
+ "(self.df > 0) & (self.df2 > 0) & " "(self.df3 > 0) & (self.df4 > 0)",
+ engine=engine,
+ )
def time_chained_cmp(self, engine, threads):
- pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine)
+ pd.eval("self.df < self.df2 < self.df3 < self.df4", engine=engine)
def time_mult(self, engine, threads):
- pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine)
+ pd.eval("self.df * self.df2 * self.df3 * self.df4", engine=engine)
def teardown(self, engine, threads):
expr.set_numexpr_threads()
class Query:
-
def setup(self):
- N = 10**6
+ N = 10 ** 6
halfway = (N // 2) - 1
- index = pd.date_range('20010101', periods=N, freq='T')
+ index = pd.date_range("20010101", periods=N, freq="T")
s = pd.Series(index)
self.ts = s.iloc[halfway]
- self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': index},
- index=index)
+ self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index)
data = np.random.randn(N)
self.min_val = data.min()
self.max_val = data.max()
def time_query_datetime_index(self):
- self.df.query('index < @self.ts')
+ self.df.query("index < @self.ts")
def time_query_datetime_column(self):
- self.df.query('dates < @self.ts')
+ self.df.query("dates < @self.ts")
def time_query_with_boolean_selection(self):
- self.df.query('(a >= @self.min_val) & (a <= @self.max_val)')
+ self.df.query("(a >= @self.min_val) & (a <= @self.max_val)")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
index 9533938b30fac..acfb26bcf5d7c 100644
--- a/asv_bench/benchmarks/frame_ctor.py
+++ b/asv_bench/benchmarks/frame_ctor.py
@@ -1,25 +1,23 @@
import numpy as np
import pandas.util.testing as tm
from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range
+
try:
from pandas.tseries.offsets import Nano, Hour
except ImportError:
# For compatibility with older versions
- from pandas.core.datetools import * # noqa
+ from pandas.core.datetools import * # noqa
class FromDicts:
-
def setup(self):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
- frame = DataFrame(np.random.randn(N, K), index=self.index,
- columns=self.columns)
+ frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
- self.dict_list = frame.to_dict(orient='records')
- self.data2 = {i: {j: float(j) for j in range(100)}
- for i in range(2000)}
+ self.dict_list = frame.to_dict(orient="records")
+ self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)}
def time_list_of_dict(self):
DataFrame(self.dict_list)
@@ -42,7 +40,6 @@ def time_nested_dict_int64(self):
class FromSeries:
-
def setup(self):
mi = MultiIndex.from_product([range(100), range(100)])
self.s = Series(np.random.randn(10000), index=mi)
@@ -54,12 +51,12 @@ def time_mi_series(self):
class FromDictwithTimestamp:
params = [Nano(1), Hour(1)]
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
- N = 10**3
+ N = 10 ** 3
np.random.seed(1234)
- idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N)
+ idx = date_range(Timestamp("1/1/1900"), freq=offset, periods=N)
df = DataFrame(np.random.randn(N, 10), index=idx)
self.d = df.to_dict()
@@ -70,7 +67,7 @@ def time_dict_with_timestamp_offsets(self, offset):
class FromRecords:
params = [None, 1000]
- param_names = ['nrows']
+ param_names = ["nrows"]
# Generators get exhausted on use, so run setup before every call
number = 1
@@ -86,7 +83,6 @@ def time_frame_from_records_generator(self, nrows):
class FromNDArray:
-
def setup(self):
N = 100000
self.data = np.random.randn(N)
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 5b76eeba115a4..af4741f94d294 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -2,17 +2,15 @@
import numpy as np
-from pandas import (
- DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range)
+from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range
import pandas.util.testing as tm
class GetNumericData:
-
def setup(self):
self.df = DataFrame(np.random.randn(10000, 25))
- self.df['foo'] = 'bar'
- self.df['bar'] = 'baz'
+ self.df["foo"] = "bar"
+ self.df["bar"] = "baz"
self.df = self.df._consolidate()
def time_frame_get_numeric_data(self):
@@ -20,17 +18,17 @@ def time_frame_get_numeric_data(self):
class Lookup:
-
def setup(self):
- self.df = DataFrame(np.random.randn(10000, 8),
- columns=list('abcdefgh'))
- self.df['foo'] = 'bar'
+ self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh"))
+ self.df["foo"] = "bar"
self.row_labels = list(self.df.index[::10])[:900]
self.col_labels = list(self.df.columns) * 100
self.row_labels_all = np.array(
- list(self.df.index) * len(self.df.columns), dtype='object')
+ list(self.df.index) * len(self.df.columns), dtype="object"
+ )
self.col_labels_all = np.array(
- list(self.df.columns) * len(self.df.index), dtype='object')
+ list(self.df.columns) * len(self.df.index), dtype="object"
+ )
def time_frame_fancy_lookup(self):
self.df.lookup(self.row_labels, self.col_labels)
@@ -40,17 +38,21 @@ def time_frame_fancy_lookup_all(self):
class Reindex:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N * 10, N))
self.idx = np.arange(4 * N, 7 * N)
self.df2 = DataFrame(
- {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
- 1: np.random.randint(0, N, N).astype(np.int16),
- 2: np.random.randint(0, N, N).astype(np.int32),
- 3: np.random.randint(0, N, N).astype(np.int64)}
- [np.random.randint(0, 4)] for c in range(N)})
+ {
+ c: {
+ 0: np.random.randint(0, 2, N).astype(np.bool_),
+ 1: np.random.randint(0, N, N).astype(np.int16),
+ 2: np.random.randint(0, N, N).astype(np.int32),
+ 3: np.random.randint(0, N, N).astype(np.int64),
+ }[np.random.randint(0, 4)]
+ for c in range(N)
+ }
+ )
def time_reindex_axis0(self):
self.df.reindex(self.idx)
@@ -66,18 +68,22 @@ def time_reindex_upcast(self):
class Rename:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df = DataFrame(np.random.randn(N * 10, N))
self.idx = np.arange(4 * N, 7 * N)
self.dict_idx = {k: k for k in self.idx}
self.df2 = DataFrame(
- {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
- 1: np.random.randint(0, N, N).astype(np.int16),
- 2: np.random.randint(0, N, N).astype(np.int32),
- 3: np.random.randint(0, N, N).astype(np.int64)}
- [np.random.randint(0, 4)] for c in range(N)})
+ {
+ c: {
+ 0: np.random.randint(0, 2, N).astype(np.bool_),
+ 1: np.random.randint(0, N, N).astype(np.int16),
+ 2: np.random.randint(0, N, N).astype(np.int32),
+ 3: np.random.randint(0, N, N).astype(np.int64),
+ }[np.random.randint(0, 4)]
+ for c in range(N)
+ }
+ )
def time_rename_single(self):
self.df.rename({0: 0})
@@ -103,13 +109,14 @@ def setup(self):
N = 1000
self.df = DataFrame(np.random.randn(N * 10, N))
self.df2 = DataFrame(np.random.randn(N * 50, 10))
- self.df3 = DataFrame(np.random.randn(N, 5 * N),
- columns=['C' + str(c) for c in range(N * 5)])
+ self.df3 = DataFrame(
+ np.random.randn(N, 5 * N), columns=["C" + str(c) for c in range(N * 5)]
+ )
self.df4 = DataFrame(np.random.randn(N * 1000, 10))
def time_iteritems(self):
# (monitor no-copying behaviour)
- if hasattr(self.df, '_item_cache'):
+ if hasattr(self.df, "_item_cache"):
self.df._item_cache.clear()
for name, col in self.df.iteritems():
pass
@@ -192,7 +199,6 @@ def time_iterrows(self):
class ToString:
-
def setup(self):
self.df = DataFrame(np.random.randn(100, 10))
@@ -201,11 +207,10 @@ def time_to_string_floats(self):
class ToHTML:
-
def setup(self):
nrows = 500
self.df2 = DataFrame(np.random.randn(nrows, 10))
- self.df2[0] = period_range('2000', periods=nrows)
+ self.df2[0] = period_range("2000", periods=nrows)
self.df2[1] = range(nrows)
def time_to_html_mixed(self):
@@ -213,7 +218,6 @@ def time_to_html_mixed(self):
class Repr:
-
def setup(self):
nrows = 10000
data = np.random.randn(nrows, 10)
@@ -238,7 +242,6 @@ def time_frame_repr_wide(self):
class MaskBool:
-
def setup(self):
data = np.random.randn(1000, 500)
df = DataFrame(data)
@@ -254,9 +257,8 @@ def time_frame_mask_floats(self):
class Isnull:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.df_no_null = DataFrame(np.random.randn(N, N))
sample = np.array([np.nan, 1.0])
@@ -267,8 +269,20 @@ def setup(self):
data = np.random.choice(sample, (N, N))
self.df_strings = DataFrame(data)
- sample = np.array([NaT, np.nan, None, np.datetime64('NaT'),
- np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd'])
+ sample = np.array(
+ [
+ NaT,
+ np.nan,
+ None,
+ np.datetime64("NaT"),
+ np.timedelta64("NaT"),
+ 0,
+ 1,
+ 2.0,
+ "",
+ "abcd",
+ ]
+ )
data = np.random.choice(sample, (N, N))
self.df_obj = DataFrame(data)
@@ -287,8 +301,8 @@ def time_isnull_obj(self):
class Fillna:
- params = ([True, False], ['pad', 'bfill'])
- param_names = ['inplace', 'method']
+ params = ([True, False], ["pad", "bfill"])
+ param_names = ["inplace", "method"]
def setup(self, inplace, method):
values = np.random.randn(10000, 100)
@@ -301,8 +315,8 @@ def time_frame_fillna(self, inplace, method):
class Dropna:
- params = (['all', 'any'], [0, 1])
- param_names = ['how', 'axis']
+ params = (["all", "any"], [0, 1])
+ param_names = ["how", "axis"]
def setup(self, how, axis):
self.df = DataFrame(np.random.randn(10000, 1000))
@@ -310,7 +324,7 @@ def setup(self, how, axis):
self.df.ix[2000:3000] = np.nan
self.df.ix[:, 60:70] = np.nan
self.df_mixed = self.df.copy()
- self.df_mixed['foo'] = 'bar'
+ self.df_mixed["foo"] = "bar"
def time_dropna(self, how, axis):
self.df.dropna(how=how, axis=axis)
@@ -322,7 +336,7 @@ def time_dropna_axis_mixed_dtypes(self, how, axis):
class Count:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
self.df = DataFrame(np.random.randn(10000, 1000))
@@ -330,15 +344,16 @@ def setup(self, axis):
self.df.ix[2000:3000] = np.nan
self.df.ix[:, 60:70] = np.nan
self.df_mixed = self.df.copy()
- self.df_mixed['foo'] = 'bar'
+ self.df_mixed["foo"] = "bar"
self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index])
- self.df.columns = MultiIndex.from_arrays([self.df.columns,
- self.df.columns])
- self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index,
- self.df_mixed.index])
- self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns,
- self.df_mixed.columns])
+ self.df.columns = MultiIndex.from_arrays([self.df.columns, self.df.columns])
+ self.df_mixed.index = MultiIndex.from_arrays(
+ [self.df_mixed.index, self.df_mixed.index]
+ )
+ self.df_mixed.columns = MultiIndex.from_arrays(
+ [self.df_mixed.columns, self.df_mixed.columns]
+ )
def time_count_level_multi(self, axis):
self.df.count(axis=axis, level=1)
@@ -348,13 +363,12 @@ def time_count_level_mixed_dtypes_multi(self, axis):
class Apply:
-
def setup(self):
self.df = DataFrame(np.random.randn(1000, 100))
self.s = Series(np.arange(1028.0))
self.df2 = DataFrame({i: self.s for i in range(1028)})
- self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+ self.df3 = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_apply_user_func(self):
self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)])
@@ -372,11 +386,10 @@ def time_apply_pass_thru(self):
self.df.apply(lambda x: x)
def time_apply_ref_by_name(self):
- self.df3.apply(lambda x: x['A'] + x['B'], axis=1)
+ self.df3.apply(lambda x: x["A"] + x["B"], axis=1)
class Dtypes:
-
def setup(self):
self.df = DataFrame(np.random.randn(1000, 1000))
@@ -385,19 +398,18 @@ def time_frame_dtypes(self):
class Equals:
-
def setup(self):
- N = 10**3
+ N = 10 ** 3
self.float_df = DataFrame(np.random.randn(N, N))
self.float_df_nan = self.float_df.copy()
self.float_df_nan.iloc[-1, -1] = np.nan
- self.object_df = DataFrame('foo', index=range(N), columns=range(N))
+ self.object_df = DataFrame("foo", index=range(N), columns=range(N))
self.object_df_nan = self.object_df.copy()
self.object_df_nan.iloc[-1, -1] = np.nan
self.nonunique_cols = self.object_df.copy()
- self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns)
+ self.nonunique_cols.columns = ["A"] * len(self.nonunique_cols.columns)
self.nonunique_cols_nan = self.nonunique_cols.copy()
self.nonunique_cols_nan.iloc[-1, -1] = np.nan
@@ -422,8 +434,8 @@ def time_frame_object_unequal(self):
class Interpolate:
- params = [None, 'infer']
- param_names = ['downcast']
+ params = [None, "infer"]
+ param_names = ["downcast"]
def setup(self, downcast):
N = 10000
@@ -431,12 +443,16 @@ def setup(self, downcast):
self.df = DataFrame(np.random.randn(N, 100))
self.df.values[::2] = np.nan
- self.df2 = DataFrame({'A': np.arange(0, N),
- 'B': np.random.randint(0, 100, N),
- 'C': np.random.randn(N),
- 'D': np.random.randn(N)})
- self.df2.loc[1::5, 'A'] = np.nan
- self.df2.loc[1::5, 'C'] = np.nan
+ self.df2 = DataFrame(
+ {
+ "A": np.arange(0, N),
+ "B": np.random.randint(0, 100, N),
+ "C": np.random.randn(N),
+ "D": np.random.randn(N),
+ }
+ )
+ self.df2.loc[1::5, "A"] = np.nan
+ self.df2.loc[1::5, "C"] = np.nan
def time_interpolate(self, downcast):
self.df.interpolate(downcast=downcast)
@@ -448,7 +464,7 @@ def time_interpolate_some_good(self, downcast):
class Shift:
# frame shift speedup issue-5609
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
self.df = DataFrame(np.random.rand(10000, 500))
@@ -458,7 +474,6 @@ def time_shift(self, axis):
class Nunique:
-
def setup(self):
self.df = DataFrame(np.random.randn(10000, 1000))
@@ -467,14 +482,17 @@ def time_frame_nunique(self):
class Duplicated:
-
def setup(self):
- n = (1 << 20)
- t = date_range('2015-01-01', freq='S', periods=(n // 64))
+ n = 1 << 20
+ t = date_range("2015-01-01", freq="S", periods=(n // 64))
xs = np.random.randn(n // 64).round(2)
- self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
- 'b': np.random.choice(t, n),
- 'c': np.random.choice(xs, n)})
+ self.df = DataFrame(
+ {
+ "a": np.random.randint(-1 << 8, 1 << 8, n),
+ "b": np.random.choice(t, n),
+ "c": np.random.choice(xs, n),
+ }
+ )
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
def time_frame_duplicated(self):
@@ -487,10 +505,10 @@ def time_frame_duplicated_wide(self):
class XS:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
- self.N = 10**4
+ self.N = 10 ** 4
self.df = DataFrame(np.random.randn(self.N, self.N))
def time_frame_xs(self, axis):
@@ -500,35 +518,38 @@ def time_frame_xs(self, axis):
class SortValues:
params = [True, False]
- param_names = ['ascending']
+ param_names = ["ascending"]
def setup(self, ascending):
- self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB'))
+ self.df = DataFrame(np.random.randn(1000000, 2), columns=list("AB"))
def time_frame_sort_values(self, ascending):
- self.df.sort_values(by='A', ascending=ascending)
+ self.df.sort_values(by="A", ascending=ascending)
class SortIndexByColumns:
-
def setup(self):
N = 10000
K = 10
- self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
- 'key2': tm.makeStringIndex(N).values.repeat(K),
- 'value': np.random.randn(N * K)})
+ self.df = DataFrame(
+ {
+ "key1": tm.makeStringIndex(N).values.repeat(K),
+ "key2": tm.makeStringIndex(N).values.repeat(K),
+ "value": np.random.randn(N * K),
+ }
+ )
def time_frame_sort_values_by_columns(self):
- self.df.sort_values(by=['key1', 'key2'])
+ self.df.sort_values(by=["key1", "key2"])
class Quantile:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
- self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
+ self.df = DataFrame(np.random.randn(1000, 3), columns=list("ABC"))
def time_frame_quantile(self, axis):
self.df.quantile([0.1, 0.5], axis=axis)
@@ -548,37 +569,37 @@ def time_info(self):
class NSort:
- params = ['first', 'last', 'all']
- param_names = ['keep']
+ params = ["first", "last", "all"]
+ param_names = ["keep"]
def setup(self, keep):
- self.df = DataFrame(np.random.randn(100000, 3),
- columns=list('ABC'))
+ self.df = DataFrame(np.random.randn(100000, 3), columns=list("ABC"))
def time_nlargest_one_column(self, keep):
- self.df.nlargest(100, 'A', keep=keep)
+ self.df.nlargest(100, "A", keep=keep)
def time_nlargest_two_columns(self, keep):
- self.df.nlargest(100, ['A', 'B'], keep=keep)
+ self.df.nlargest(100, ["A", "B"], keep=keep)
def time_nsmallest_one_column(self, keep):
- self.df.nsmallest(100, 'A', keep=keep)
+ self.df.nsmallest(100, "A", keep=keep)
def time_nsmallest_two_columns(self, keep):
- self.df.nsmallest(100, ['A', 'B'], keep=keep)
+ self.df.nsmallest(100, ["A", "B"], keep=keep)
class Describe:
-
def setup(self):
- self.df = DataFrame({
- 'a': np.random.randint(0, 100, int(1e6)),
- 'b': np.random.randint(0, 100, int(1e6)),
- 'c': np.random.randint(0, 100, int(1e6))
- })
+ self.df = DataFrame(
+ {
+ "a": np.random.randint(0, 100, int(1e6)),
+ "b": np.random.randint(0, 100, int(1e6)),
+ "c": np.random.randint(0, 100, int(1e6)),
+ }
+ )
def time_series_describe(self):
- self.df['a'].describe()
+ self.df["a"].describe()
def time_dataframe_describe(self):
self.df.describe()
diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py
index 65a03bfda48c5..0d0b75561d057 100644
--- a/asv_bench/benchmarks/gil.py
+++ b/asv_bench/benchmarks/gil.py
@@ -2,9 +2,19 @@
import pandas.util.testing as tm
from pandas import DataFrame, Series, read_csv, factorize, date_range
from pandas.core.algorithms import take_1d
+
try:
- from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max,
- rolling_var, rolling_skew, rolling_kurt, rolling_std)
+ from pandas import (
+ rolling_median,
+ rolling_mean,
+ rolling_min,
+ rolling_max,
+ rolling_var,
+ rolling_skew,
+ rolling_kurt,
+ rolling_std,
+ )
+
have_rolling_methods = True
except ImportError:
have_rolling_methods = False
@@ -14,6 +24,7 @@
from pandas import algos
try:
from pandas.util.testing import test_parallel
+
have_real_test_parallel = True
except ImportError:
have_real_test_parallel = False
@@ -21,32 +32,36 @@
def test_parallel(num_threads=1):
def wrapper(fname):
return fname
+
return wrapper
+
from .pandas_vb_common import BaseIO
class ParallelGroupbyMethods:
- params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod',
- 'sum', 'var'])
- param_names = ['threads', 'method']
+ params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"])
+ param_names = ["threads", "method"]
def setup(self, threads, method):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- ngroups = 10**3
- df = DataFrame({'key': np.random.randint(0, ngroups, size=N),
- 'data': np.random.randn(N)})
+ N = 10 ** 6
+ ngroups = 10 ** 3
+ df = DataFrame(
+ {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)}
+ )
@test_parallel(num_threads=threads)
def parallel():
- getattr(df.groupby('key')['data'], method)()
+ getattr(df.groupby("key")["data"], method)()
+
self.parallel = parallel
def loop():
- getattr(df.groupby('key')['data'], method)()
+ getattr(df.groupby("key")["data"], method)()
+
self.loop = loop
def time_parallel(self, threads, method):
@@ -60,18 +75,19 @@ def time_loop(self, threads, method):
class ParallelGroups:
params = [2, 4, 8]
- param_names = ['threads']
+ param_names = ["threads"]
def setup(self, threads):
if not have_real_test_parallel:
raise NotImplementedError
- size = 2**22
- ngroups = 10**3
+ size = 2 ** 22
+ ngroups = 10 ** 3
data = Series(np.random.randint(0, ngroups, size=size))
@test_parallel(num_threads=threads)
def get_groups():
data.groupby(data).groups
+
self.get_groups = get_groups
def time_get_groups(self, threads):
@@ -80,19 +96,20 @@ def time_get_groups(self, threads):
class ParallelTake1D:
- params = ['int64', 'float64']
- param_names = ['dtype']
+ params = ["int64", "float64"]
+ param_names = ["dtype"]
def setup(self, dtype):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- df = DataFrame({'col': np.arange(N, dtype=dtype)})
+ N = 10 ** 6
+ df = DataFrame({"col": np.arange(N, dtype=dtype)})
indexer = np.arange(100, len(df) - 100)
@test_parallel(num_threads=2)
def parallel_take1d():
- take_1d(df['col'].values, indexer)
+ take_1d(df["col"].values, indexer)
+
self.parallel_take1d = parallel_take1d
def time_take1d(self, dtype):
@@ -107,14 +124,14 @@ class ParallelKth:
def setup(self):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**7
- k = 5 * 10**5
- kwargs_list = [{'arr': np.random.randn(N)},
- {'arr': np.random.randn(N)}]
+ N = 10 ** 7
+ k = 5 * 10 ** 5
+ kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}]
@test_parallel(num_threads=2, kwargs_list=kwargs_list)
def parallel_kth_smallest(arr):
algos.kth_smallest(arr, k)
+
self.parallel_kth_smallest = parallel_kth_smallest
def time_kth_smallest(self):
@@ -122,81 +139,90 @@ def time_kth_smallest(self):
class ParallelDatetimeFields:
-
def setup(self):
if not have_real_test_parallel:
raise NotImplementedError
- N = 10**6
- self.dti = date_range('1900-01-01', periods=N, freq='T')
- self.period = self.dti.to_period('D')
+ N = 10 ** 6
+ self.dti = date_range("1900-01-01", periods=N, freq="T")
+ self.period = self.dti.to_period("D")
def time_datetime_field_year(self):
@test_parallel(num_threads=2)
def run(dti):
dti.year
+
run(self.dti)
def time_datetime_field_day(self):
@test_parallel(num_threads=2)
def run(dti):
dti.day
+
run(self.dti)
def time_datetime_field_daysinmonth(self):
@test_parallel(num_threads=2)
def run(dti):
dti.days_in_month
+
run(self.dti)
def time_datetime_field_normalize(self):
@test_parallel(num_threads=2)
def run(dti):
dti.normalize()
+
run(self.dti)
def time_datetime_to_period(self):
@test_parallel(num_threads=2)
def run(dti):
- dti.to_period('S')
+ dti.to_period("S")
+
run(self.dti)
def time_period_to_datetime(self):
@test_parallel(num_threads=2)
def run(period):
period.to_timestamp()
+
run(self.period)
class ParallelRolling:
- params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std']
- param_names = ['method']
+ params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"]
+ param_names = ["method"]
def setup(self, method):
if not have_real_test_parallel:
raise NotImplementedError
win = 100
arr = np.random.rand(100000)
- if hasattr(DataFrame, 'rolling'):
+ if hasattr(DataFrame, "rolling"):
df = DataFrame(arr).rolling(win)
@test_parallel(num_threads=2)
def parallel_rolling():
getattr(df, method)()
+
self.parallel_rolling = parallel_rolling
elif have_rolling_methods:
- rolling = {'median': rolling_median,
- 'mean': rolling_mean,
- 'min': rolling_min,
- 'max': rolling_max,
- 'var': rolling_var,
- 'skew': rolling_skew,
- 'kurt': rolling_kurt,
- 'std': rolling_std}
+ rolling = {
+ "median": rolling_median,
+ "mean": rolling_mean,
+ "min": rolling_min,
+ "max": rolling_max,
+ "var": rolling_var,
+ "skew": rolling_skew,
+ "kurt": rolling_kurt,
+ "std": rolling_std,
+ }
@test_parallel(num_threads=2)
def parallel_rolling():
rolling[method](arr, win)
+
self.parallel_rolling = parallel_rolling
else:
raise NotImplementedError
@@ -209,30 +235,34 @@ class ParallelReadCSV(BaseIO):
number = 1
repeat = 5
- params = ['float', 'object', 'datetime']
- param_names = ['dtype']
+ params = ["float", "object", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
if not have_real_test_parallel:
raise NotImplementedError
rows = 10000
cols = 50
- data = {'float': DataFrame(np.random.randn(rows, cols)),
- 'datetime': DataFrame(np.random.randn(rows, cols),
- index=date_range('1/1/2000',
- periods=rows)),
- 'object': DataFrame('foo',
- index=range(rows),
- columns=['object%03d'.format(i)
- for i in range(5)])}
-
- self.fname = '__test_{}__.csv'.format(dtype)
+ data = {
+ "float": DataFrame(np.random.randn(rows, cols)),
+ "datetime": DataFrame(
+ np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows)
+ ),
+ "object": DataFrame(
+ "foo",
+ index=range(rows),
+                columns=["object%03d" % i for i in range(5)],
+ ),
+ }
+
+ self.fname = "__test_{}__.csv".format(dtype)
df = data[dtype]
df.to_csv(self.fname)
@test_parallel(num_threads=2)
def parallel_read_csv():
read_csv(self.fname)
+
self.parallel_read_csv = parallel_read_csv
def time_read_csv(self, dtype):
@@ -244,7 +274,7 @@ class ParallelFactorize:
number = 1
repeat = 5
params = [2, 4, 8]
- param_names = ['threads']
+ param_names = ["threads"]
def setup(self, threads):
if not have_real_test_parallel:
@@ -255,10 +285,12 @@ def setup(self, threads):
@test_parallel(num_threads=threads)
def parallel():
factorize(strings)
+
self.parallel = parallel
def loop():
factorize(strings)
+
self.loop = loop
def time_parallel(self, threads):
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 3097ada6d2022..39b07d4734399 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -5,18 +5,55 @@
import numpy as np
from pandas import (
- Categorical, DataFrame, MultiIndex, Series, Timestamp,
- date_range, period_range)
+ Categorical,
+ DataFrame,
+ MultiIndex,
+ Series,
+ Timestamp,
+ date_range,
+ period_range,
+)
import pandas.util.testing as tm
method_blacklist = {
- 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
- 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
- 'var', 'mad', 'describe', 'std', 'quantile'},
- 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
- 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
- 'std'}
+ "object": {
+ "median",
+ "prod",
+ "sem",
+ "cumsum",
+ "sum",
+ "cummin",
+ "mean",
+ "max",
+ "skew",
+ "cumprod",
+ "cummax",
+ "rank",
+ "pct_change",
+ "min",
+ "var",
+ "mad",
+ "describe",
+ "std",
+ "quantile",
+ },
+ "datetime": {
+ "median",
+ "prod",
+ "sem",
+ "cumsum",
+ "sum",
+ "mean",
+ "skew",
+ "cumprod",
+ "cummax",
+ "pct_change",
+ "var",
+ "mad",
+ "describe",
+ "std",
+ },
}
@@ -26,28 +63,31 @@ def setup(self):
self.data = Series(np.random.randn(len(self.labels)))
def time_groupby_apply_dict_return(self):
- self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0],
- 'last': x.values[-1]})
+ self.data.groupby(self.labels).apply(
+ lambda x: {"first": x.values[0], "last": x.values[-1]}
+ )
class Apply:
-
def setup_cache(self):
- N = 10**4
+ N = 10 ** 4
labels = np.random.randint(0, 2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
- df = DataFrame({'key': labels,
- 'key2': labels2,
- 'value1': np.random.randn(N),
- 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)
- })
+ df = DataFrame(
+ {
+ "key": labels,
+ "key2": labels2,
+ "value1": np.random.randn(N),
+ "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
+ }
+ )
return df
def time_scalar_function_multi_col(self, df):
- df.groupby(['key', 'key2']).apply(lambda x: 1)
+ df.groupby(["key", "key2"]).apply(lambda x: 1)
def time_scalar_function_single_col(self, df):
- df.groupby('key').apply(lambda x: 1)
+ df.groupby("key").apply(lambda x: 1)
@staticmethod
def df_copy_function(g):
@@ -56,27 +96,29 @@ def df_copy_function(g):
return g.copy()
def time_copy_function_multi_col(self, df):
- df.groupby(['key', 'key2']).apply(self.df_copy_function)
+ df.groupby(["key", "key2"]).apply(self.df_copy_function)
def time_copy_overhead_single_col(self, df):
- df.groupby('key').apply(self.df_copy_function)
+ df.groupby("key").apply(self.df_copy_function)
class Groups:
- param_names = ['key']
- params = ['int64_small', 'int64_large', 'object_small', 'object_large']
+ param_names = ["key"]
+ params = ["int64_small", "int64_large", "object_small", "object_large"]
def setup_cache(self):
- size = 10**6
- data = {'int64_small': Series(np.random.randint(0, 100, size=size)),
- 'int64_large': Series(np.random.randint(0, 10000, size=size)),
- 'object_small': Series(
- tm.makeStringIndex(100).take(
- np.random.randint(0, 100, size=size))),
- 'object_large': Series(
- tm.makeStringIndex(10000).take(
- np.random.randint(0, 10000, size=size)))}
+ size = 10 ** 6
+ data = {
+ "int64_small": Series(np.random.randint(0, 100, size=size)),
+ "int64_large": Series(np.random.randint(0, 10000, size=size)),
+ "object_small": Series(
+ tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))
+ ),
+ "object_large": Series(
+ tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))
+ ),
+ }
return data
def setup(self, data, key):
@@ -89,7 +131,7 @@ def time_series_groups(self, data, key):
class GroupManyLabels:
params = [1, 1000]
- param_names = ['ncols']
+ param_names = ["ncols"]
def setup(self, ncols):
N = 1000
@@ -103,46 +145,45 @@ def time_sum(self, ncols):
class Nth:
- param_names = ['dtype']
- params = ['float32', 'float64', 'datetime', 'object']
+ param_names = ["dtype"]
+ params = ["float32", "float64", "datetime", "object"]
def setup(self, dtype):
- N = 10**5
+ N = 10 ** 5
# with datetimes (GH7555)
- if dtype == 'datetime':
- values = date_range('1/1/2011', periods=N, freq='s')
- elif dtype == 'object':
- values = ['foo'] * N
+ if dtype == "datetime":
+ values = date_range("1/1/2011", periods=N, freq="s")
+ elif dtype == "object":
+ values = ["foo"] * N
else:
values = np.arange(N).astype(dtype)
key = np.arange(N)
- self.df = DataFrame({'key': key, 'values': values})
+ self.df = DataFrame({"key": key, "values": values})
self.df.iloc[1, 1] = np.nan # insert missing data
def time_frame_nth_any(self, dtype):
- self.df.groupby('key').nth(0, dropna='any')
+ self.df.groupby("key").nth(0, dropna="any")
def time_groupby_nth_all(self, dtype):
- self.df.groupby('key').nth(0, dropna='all')
+ self.df.groupby("key").nth(0, dropna="all")
def time_frame_nth(self, dtype):
- self.df.groupby('key').nth(0)
+ self.df.groupby("key").nth(0)
def time_series_nth_any(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
+ self.df["values"].groupby(self.df["key"]).nth(0, dropna="any")
def time_series_nth_all(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
+ self.df["values"].groupby(self.df["key"]).nth(0, dropna="all")
def time_series_nth(self, dtype):
- self.df['values'].groupby(self.df['key']).nth(0)
+ self.df["values"].groupby(self.df["key"]).nth(0)
class DateAttributes:
-
def setup(self):
- rng = date_range('1/1/2000', '12/31/2005', freq='H')
+ rng = date_range("1/1/2000", "12/31/2005", freq="H")
self.year, self.month, self.day = rng.year, rng.month, rng.day
self.ts = Series(np.random.randn(len(rng)), index=rng)
@@ -151,154 +192,167 @@ def time_len_groupby_object(self):
class Int64:
-
def setup(self):
arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5))
i = np.random.choice(len(arr), len(arr) * 5)
arr = np.vstack((arr, arr[i]))
i = np.random.permutation(len(arr))
arr = arr[i]
- self.cols = list('abcde')
+ self.cols = list("abcde")
self.df = DataFrame(arr, columns=self.cols)
- self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10
+ self.df["jim"], self.df["joe"] = np.random.randn(2, len(self.df)) * 10
def time_overflow(self):
self.df.groupby(self.cols).max()
class CountMultiDtype:
-
def setup_cache(self):
n = 10000
- offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
- dates = np.datetime64('now') + offsets
- dates[np.random.rand(n) > 0.5] = np.datetime64('nat')
- offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat')
+ offsets = np.random.randint(n, size=n).astype("timedelta64[ns]")
+ dates = np.datetime64("now") + offsets
+ dates[np.random.rand(n) > 0.5] = np.datetime64("nat")
+ offsets[np.random.rand(n) > 0.5] = np.timedelta64("nat")
value2 = np.random.randn(n)
value2[np.random.rand(n) > 0.5] = np.nan
- obj = np.random.choice(list('ab'), size=n).astype(object)
+ obj = np.random.choice(list("ab"), size=n).astype(object)
obj[np.random.randn(n) > 0.5] = np.nan
- df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'dates': dates,
- 'value2': value2,
- 'value3': np.random.randn(n),
- 'ints': np.random.randint(0, 1000, size=n),
- 'obj': obj,
- 'offsets': offsets})
+ df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "dates": dates,
+ "value2": value2,
+ "value3": np.random.randn(n),
+ "ints": np.random.randint(0, 1000, size=n),
+ "obj": obj,
+ "offsets": offsets,
+ }
+ )
return df
def time_multi_count(self, df):
- df.groupby(['key1', 'key2']).count()
+ df.groupby(["key1", "key2"]).count()
class CountMultiInt:
-
def setup_cache(self):
n = 10000
- df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'ints': np.random.randint(0, 1000, size=n),
- 'ints2': np.random.randint(0, 1000, size=n)})
+ df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "ints": np.random.randint(0, 1000, size=n),
+ "ints2": np.random.randint(0, 1000, size=n),
+ }
+ )
return df
def time_multi_int_count(self, df):
- df.groupby(['key1', 'key2']).count()
+ df.groupby(["key1", "key2"]).count()
def time_multi_int_nunique(self, df):
- df.groupby(['key1', 'key2']).nunique()
+ df.groupby(["key1", "key2"]).nunique()
class AggFunctions:
-
def setup_cache(self):
- N = 10**5
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
- df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)),
- 'key2': fac2.take(np.random.randint(0, 2, size=N)),
- 'value1': np.random.randn(N),
- 'value2': np.random.randn(N),
- 'value3': np.random.randn(N)})
+ N = 10 ** 5
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
+ df = DataFrame(
+ {
+ "key1": fac1.take(np.random.randint(0, 3, size=N)),
+ "key2": fac2.take(np.random.randint(0, 2, size=N)),
+ "value1": np.random.randn(N),
+ "value2": np.random.randn(N),
+ "value3": np.random.randn(N),
+ }
+ )
return df
def time_different_str_functions(self, df):
- df.groupby(['key1', 'key2']).agg({'value1': 'mean',
- 'value2': 'var',
- 'value3': 'sum'})
+ df.groupby(["key1", "key2"]).agg(
+ {"value1": "mean", "value2": "var", "value3": "sum"}
+ )
def time_different_numpy_functions(self, df):
- df.groupby(['key1', 'key2']).agg({'value1': np.mean,
- 'value2': np.var,
- 'value3': np.sum})
+ df.groupby(["key1", "key2"]).agg(
+ {"value1": np.mean, "value2": np.var, "value3": np.sum}
+ )
def time_different_python_functions_multicol(self, df):
- df.groupby(['key1', 'key2']).agg([sum, min, max])
+ df.groupby(["key1", "key2"]).agg([sum, min, max])
def time_different_python_functions_singlecol(self, df):
- df.groupby('key1').agg([sum, min, max])
+ df.groupby("key1").agg([sum, min, max])
class GroupStrings:
-
def setup(self):
- n = 2 * 10**5
- alpha = list(map(''.join, product(ascii_letters, repeat=4)))
+ n = 2 * 10 ** 5
+ alpha = list(map("".join, product(ascii_letters, repeat=4)))
data = np.random.choice(alpha, (n // 5, 4), replace=False)
data = np.repeat(data, 5, axis=0)
- self.df = DataFrame(data, columns=list('abcd'))
- self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3)
+ self.df = DataFrame(data, columns=list("abcd"))
+ self.df["joe"] = (np.random.randn(len(self.df)) * 10).round(3)
self.df = self.df.sample(frac=1).reset_index(drop=True)
def time_multi_columns(self):
- self.df.groupby(list('abcd')).max()
+ self.df.groupby(list("abcd")).max()
class MultiColumn:
-
def setup_cache(self):
- N = 10**5
+ N = 10 ** 5
key1 = np.tile(np.arange(100, dtype=object), 1000)
key2 = key1.copy()
np.random.shuffle(key1)
np.random.shuffle(key2)
- df = DataFrame({'key1': key1,
- 'key2': key2,
- 'data1': np.random.randn(N),
- 'data2': np.random.randn(N)})
+ df = DataFrame(
+ {
+ "key1": key1,
+ "key2": key2,
+ "data1": np.random.randn(N),
+ "data2": np.random.randn(N),
+ }
+ )
return df
def time_lambda_sum(self, df):
- df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())
+ df.groupby(["key1", "key2"]).agg(lambda x: x.values.sum())
def time_cython_sum(self, df):
- df.groupby(['key1', 'key2']).sum()
+ df.groupby(["key1", "key2"]).sum()
def time_col_select_lambda_sum(self, df):
- df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())
+ df.groupby(["key1", "key2"])["data1"].agg(lambda x: x.values.sum())
def time_col_select_numpy_sum(self, df):
- df.groupby(['key1', 'key2'])['data1'].agg(np.sum)
+ df.groupby(["key1", "key2"])["data1"].agg(np.sum)
class Size:
-
def setup(self):
- n = 10**5
- offsets = np.random.randint(n, size=n).astype('timedelta64[ns]')
- dates = np.datetime64('now') + offsets
- self.df = DataFrame({'key1': np.random.randint(0, 500, size=n),
- 'key2': np.random.randint(0, 100, size=n),
- 'value1': np.random.randn(n),
- 'value2': np.random.randn(n),
- 'value3': np.random.randn(n),
- 'dates': dates})
+ n = 10 ** 5
+ offsets = np.random.randint(n, size=n).astype("timedelta64[ns]")
+ dates = np.datetime64("now") + offsets
+ self.df = DataFrame(
+ {
+ "key1": np.random.randint(0, 500, size=n),
+ "key2": np.random.randint(0, 100, size=n),
+ "value1": np.random.randn(n),
+ "value2": np.random.randn(n),
+ "value3": np.random.randn(n),
+ "dates": dates,
+ }
+ )
self.draws = Series(np.random.randn(n))
- labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
- self.cats = labels.astype('category')
+ labels = Series(["foo", "bar", "baz", "qux"] * (n // 4))
+ self.cats = labels.astype("category")
def time_multi_size(self):
- self.df.groupby(['key1', 'key2']).size()
+ self.df.groupby(["key1", "key2"]).size()
def time_category_size(self):
self.draws.groupby(self.cats).size()
@@ -306,15 +360,47 @@ def time_category_size(self):
class GroupByMethods:
- param_names = ['dtype', 'method', 'application']
- params = [['int', 'float', 'object', 'datetime'],
- ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
- 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
- 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
- 'pct_change', 'prod', 'quantile', 'rank', 'sem', 'shift',
- 'size', 'skew', 'std', 'sum', 'tail', 'unique', 'value_counts',
- 'var'],
- ['direct', 'transformation']]
+ param_names = ["dtype", "method", "application"]
+ params = [
+ ["int", "float", "object", "datetime"],
+ [
+ "all",
+ "any",
+ "bfill",
+ "count",
+ "cumcount",
+ "cummax",
+ "cummin",
+ "cumprod",
+ "cumsum",
+ "describe",
+ "ffill",
+ "first",
+ "head",
+ "last",
+ "mad",
+ "max",
+ "min",
+ "median",
+ "mean",
+ "nunique",
+ "pct_change",
+ "prod",
+ "quantile",
+ "rank",
+ "sem",
+ "shift",
+ "size",
+ "skew",
+ "std",
+ "sum",
+ "tail",
+ "unique",
+ "value_counts",
+ "var",
+ ],
+ ["direct", "transformation"],
+ ]
def setup(self, dtype, method, application):
if method in method_blacklist.get(dtype, {}):
@@ -323,29 +409,28 @@ def setup(self, dtype, method, application):
size = ngroups * 2
rng = np.arange(ngroups)
values = rng.take(np.random.randint(0, ngroups, size=size))
- if dtype == 'int':
+ if dtype == "int":
key = np.random.randint(0, size, size=size)
- elif dtype == 'float':
- key = np.concatenate([np.random.random(ngroups) * 0.1,
- np.random.random(ngroups) * 10.0])
- elif dtype == 'object':
- key = ['foo'] * size
- elif dtype == 'datetime':
- key = date_range('1/1/2011', periods=size, freq='s')
-
- df = DataFrame({'values': values, 'key': key})
-
- if application == 'transform':
- if method == 'describe':
+ elif dtype == "float":
+ key = np.concatenate(
+ [np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]
+ )
+ elif dtype == "object":
+ key = ["foo"] * size
+ elif dtype == "datetime":
+ key = date_range("1/1/2011", periods=size, freq="s")
+
+ df = DataFrame({"values": values, "key": key})
+
+ if application == "transform":
+ if method == "describe":
raise NotImplementedError
- self.as_group_method = lambda: df.groupby(
- 'key')['values'].transform(method)
- self.as_field_method = lambda: df.groupby(
- 'values')['key'].transform(method)
+ self.as_group_method = lambda: df.groupby("key")["values"].transform(method)
+ self.as_field_method = lambda: df.groupby("values")["key"].transform(method)
else:
- self.as_group_method = getattr(df.groupby('key')['values'], method)
- self.as_field_method = getattr(df.groupby('values')['key'], method)
+ self.as_group_method = getattr(df.groupby("key")["values"], method)
+ self.as_field_method = getattr(df.groupby("values")["key"], method)
def time_dtype_as_group(self, dtype, method, application):
self.as_group_method()
@@ -356,20 +441,22 @@ def time_dtype_as_field(self, dtype, method, application):
class RankWithTies:
# GH 21237
- param_names = ['dtype', 'tie_method']
- params = [['float64', 'float32', 'int64', 'datetime64'],
- ['first', 'average', 'dense', 'min', 'max']]
+ param_names = ["dtype", "tie_method"]
+ params = [
+ ["float64", "float32", "int64", "datetime64"],
+ ["first", "average", "dense", "min", "max"],
+ ]
def setup(self, dtype, tie_method):
- N = 10**4
- if dtype == 'datetime64':
+ N = 10 ** 4
+ if dtype == "datetime64":
data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
else:
data = np.array([1] * N, dtype=dtype)
- self.df = DataFrame({'values': data, 'key': ['foo'] * N})
+ self.df = DataFrame({"values": data, "key": ["foo"] * N})
def time_rank_ties(self, dtype, tie_method):
- self.df.groupby('key').rank(method=tie_method)
+ self.df.groupby("key").rank(method=tie_method)
class Float32:
@@ -382,57 +469,61 @@ def setup(self):
self.df = DataFrame(dict(a=arr, b=arr))
def time_sum(self):
- self.df.groupby(['a'])['b'].sum()
+ self.df.groupby(["a"])["b"].sum()
class Categories:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
arr = np.random.random(N)
- data = {'a': Categorical(np.random.randint(10000, size=N)),
- 'b': arr}
+ data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr}
self.df = DataFrame(data)
- data = {'a': Categorical(np.random.randint(10000, size=N),
- ordered=True),
- 'b': arr}
+ data = {
+ "a": Categorical(np.random.randint(10000, size=N), ordered=True),
+ "b": arr,
+ }
self.df_ordered = DataFrame(data)
- data = {'a': Categorical(np.random.randint(100, size=N),
- categories=np.arange(10000)),
- 'b': arr}
+ data = {
+ "a": Categorical(
+ np.random.randint(100, size=N), categories=np.arange(10000)
+ ),
+ "b": arr,
+ }
self.df_extra_cat = DataFrame(data)
def time_groupby_sort(self):
- self.df.groupby('a')['b'].count()
+ self.df.groupby("a")["b"].count()
def time_groupby_nosort(self):
- self.df.groupby('a', sort=False)['b'].count()
+ self.df.groupby("a", sort=False)["b"].count()
def time_groupby_ordered_sort(self):
- self.df_ordered.groupby('a')['b'].count()
+ self.df_ordered.groupby("a")["b"].count()
def time_groupby_ordered_nosort(self):
- self.df_ordered.groupby('a', sort=False)['b'].count()
+ self.df_ordered.groupby("a", sort=False)["b"].count()
def time_groupby_extra_cat_sort(self):
- self.df_extra_cat.groupby('a')['b'].count()
+ self.df_extra_cat.groupby("a")["b"].count()
def time_groupby_extra_cat_nosort(self):
- self.df_extra_cat.groupby('a', sort=False)['b'].count()
+ self.df_extra_cat.groupby("a", sort=False)["b"].count()
class Datelike:
# GH 14338
- params = ['period_range', 'date_range', 'date_range_tz']
- param_names = ['grouper']
+ params = ["period_range", "date_range", "date_range_tz"]
+ param_names = ["grouper"]
def setup(self, grouper):
- N = 10**4
- rng_map = {'period_range': period_range,
- 'date_range': date_range,
- 'date_range_tz': partial(date_range, tz='US/Central')}
- self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N)
- self.df = DataFrame(np.random.randn(10**4, 2))
+ N = 10 ** 4
+ rng_map = {
+ "period_range": period_range,
+ "date_range": date_range,
+ "date_range_tz": partial(date_range, tz="US/Central"),
+ }
+ self.grouper = rng_map[grouper]("1900-01-01", freq="D", periods=N)
+ self.df = DataFrame(np.random.randn(10 ** 4, 2))
def time_sum(self, grouper):
self.df.groupby(self.grouper).sum()
@@ -442,11 +533,10 @@ class SumBools:
# GH 2692
def setup(self):
N = 500
- self.df = DataFrame({'ii': range(N),
- 'bb': [True] * N})
+ self.df = DataFrame({"ii": range(N), "bb": [True] * N})
def time_groupby_sum_booleans(self):
- self.df.groupby('ii').sum()
+ self.df.groupby("ii").sum()
class SumMultiLevel:
@@ -455,84 +545,85 @@ class SumMultiLevel:
def setup(self):
N = 50
- self.df = DataFrame({'A': list(range(N)) * 2,
- 'B': range(N * 2),
- 'C': 1}).set_index(['A', 'B'])
+ self.df = DataFrame(
+ {"A": list(range(N)) * 2, "B": range(N * 2), "C": 1}
+ ).set_index(["A", "B"])
def time_groupby_sum_multiindex(self):
self.df.groupby(level=[0, 1]).sum()
class Transform:
-
def setup(self):
n1 = 400
n2 = 250
- index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
- codes=[np.repeat(range(n1), n2).tolist(),
- list(range(n2)) * n1],
- names=['lev1', 'lev2'])
+ index = MultiIndex(
+ levels=[np.arange(n1), tm.makeStringIndex(n2)],
+ codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1],
+ names=["lev1", "lev2"],
+ )
arr = np.random.randn(n1 * n2, 3)
arr[::10000, 0] = np.nan
arr[1::10000, 1] = np.nan
arr[2::10000, 2] = np.nan
- data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
+ data = DataFrame(arr, index=index, columns=["col1", "col20", "col3"])
self.df = data
n = 20000
- self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
- columns=['jim', 'joe', 'jolie'])
+ self.df1 = DataFrame(
+ np.random.randint(1, n, (n, 3)), columns=["jim", "joe", "jolie"]
+ )
self.df2 = self.df1.copy()
- self.df2['jim'] = self.df2['joe']
+ self.df2["jim"] = self.df2["joe"]
- self.df3 = DataFrame(np.random.randint(1, (n / 10), (n, 3)),
- columns=['jim', 'joe', 'jolie'])
+ self.df3 = DataFrame(
+ np.random.randint(1, (n / 10), (n, 3)), columns=["jim", "joe", "jolie"]
+ )
self.df4 = self.df3.copy()
- self.df4['jim'] = self.df4['joe']
+ self.df4["jim"] = self.df4["joe"]
def time_transform_lambda_max(self):
- self.df.groupby(level='lev1').transform(lambda x: max(x))
+ self.df.groupby(level="lev1").transform(lambda x: max(x))
def time_transform_ufunc_max(self):
- self.df.groupby(level='lev1').transform(np.max)
+ self.df.groupby(level="lev1").transform(np.max)
def time_transform_multi_key1(self):
- self.df1.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df1.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key2(self):
- self.df2.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df2.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key3(self):
- self.df3.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df3.groupby(["jim", "joe"])["jolie"].transform("max")
def time_transform_multi_key4(self):
- self.df4.groupby(['jim', 'joe'])['jolie'].transform('max')
+ self.df4.groupby(["jim", "joe"])["jolie"].transform("max")
class TransformBools:
-
def setup(self):
N = 120000
transition_points = np.sort(np.random.choice(np.arange(N), 1400))
transitions = np.zeros(N, dtype=np.bool)
transitions[transition_points] = True
self.g = transitions.cumsum()
- self.df = DataFrame({'signal': np.random.rand(N)})
+ self.df = DataFrame({"signal": np.random.rand(N)})
def time_transform_mean(self):
- self.df['signal'].groupby(self.g).transform(np.mean)
+ self.df["signal"].groupby(self.g).transform(np.mean)
class TransformNaN:
# GH 12737
def setup(self):
- self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10),
- 'B': np.nan,
- 'C': np.nan})
- self.df_nans.loc[4::10, 'B':'C'] = 5
+ self.df_nans = DataFrame(
+ {"key": np.repeat(np.arange(1000), 10), "B": np.nan, "C": np.nan}
+ )
+ self.df_nans.loc[4::10, "B":"C"] = 5
def time_first(self):
- self.df_nans.groupby('key').transform('first')
+ self.df_nans.groupby("key").transform("first")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py
index 1eedc1a2b3021..6541ddcb0397d 100644
--- a/asv_bench/benchmarks/index_object.py
+++ b/asv_bench/benchmarks/index_object.py
@@ -1,38 +1,47 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex,
- Float64Index, IntervalIndex)
+from pandas import (
+ Series,
+ date_range,
+ DatetimeIndex,
+ Index,
+ RangeIndex,
+ Float64Index,
+ IntervalIndex,
+)
class SetOperations:
- params = (['datetime', 'date_string', 'int', 'strings'],
- ['intersection', 'union', 'symmetric_difference'])
- param_names = ['dtype', 'method']
+ params = (
+ ["datetime", "date_string", "int", "strings"],
+ ["intersection", "union", "symmetric_difference"],
+ )
+ param_names = ["dtype", "method"]
def setup(self, dtype, method):
- N = 10**5
- dates_left = date_range('1/1/2000', periods=N, freq='T')
- fmt = '%Y-%m-%d %H:%M:%S'
+ N = 10 ** 5
+ dates_left = date_range("1/1/2000", periods=N, freq="T")
+ fmt = "%Y-%m-%d %H:%M:%S"
date_str_left = Index(dates_left.strftime(fmt))
int_left = Index(np.arange(N))
str_left = tm.makeStringIndex(N)
- data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]},
- 'date_string': {'left': date_str_left,
- 'right': date_str_left[:-1]},
- 'int': {'left': int_left, 'right': int_left[:-1]},
- 'strings': {'left': str_left, 'right': str_left[:-1]}}
- self.left = data[dtype]['left']
- self.right = data[dtype]['right']
+ data = {
+ "datetime": {"left": dates_left, "right": dates_left[:-1]},
+ "date_string": {"left": date_str_left, "right": date_str_left[:-1]},
+ "int": {"left": int_left, "right": int_left[:-1]},
+ "strings": {"left": str_left, "right": str_left[:-1]},
+ }
+ self.left = data[dtype]["left"]
+ self.right = data[dtype]["right"]
def time_operation(self, dtype, method):
getattr(self.left, method)(self.right)
class SetDisjoint:
-
def setup(self):
- N = 10**5
+ N = 10 ** 5
B = N + 20000
self.datetime_left = DatetimeIndex(range(N))
self.datetime_right = DatetimeIndex(range(N, B))
@@ -42,9 +51,8 @@ def time_datetime_difference_disjoint(self):
class Datetime:
-
def setup(self):
- self.dr = date_range('20000101', freq='D', periods=10000)
+ self.dr = date_range("20000101", freq="D", periods=10000)
def time_is_dates_only(self):
self.dr._is_dates_only
@@ -52,12 +60,12 @@ def time_is_dates_only(self):
class Ops:
- params = ['float', 'int']
- param_names = ['dtype']
+ params = ["float", "int"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'}
+ N = 10 ** 6
+ indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"}
self.index = getattr(tm, indexes[dtype])(N)
def time_add(self, dtype):
@@ -77,10 +85,9 @@ def time_modulo(self, dtype):
class Range:
-
def setup(self):
- self.idx_inc = RangeIndex(start=0, stop=10**7, step=3)
- self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3)
+ self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)
+ self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3)
def time_max(self):
self.idx_inc.max()
@@ -102,7 +109,6 @@ def time_get_loc_dec(self):
class IndexAppend:
-
def setup(self):
N = 10000
@@ -132,19 +138,20 @@ def time_append_obj_list(self):
class Indexing:
- params = ['String', 'Float', 'Int']
- param_names = ['dtype']
+ params = ["String", "Float", "Int"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- self.idx = getattr(tm, 'make{}Index'.format(dtype))(N)
+ N = 10 ** 6
+ self.idx = getattr(tm, "make{}Index".format(dtype))(N)
self.array_mask = (np.arange(N) % 3) == 0
self.series_mask = Series(self.array_mask)
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
- self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half])
- .sort_values())
+ self.non_unique_sorted = (
+ self.sorted[:half].append(self.sorted[:half]).sort_values()
+ )
self.key = self.sorted[N // 4]
def time_boolean_array(self, dtype):
@@ -188,7 +195,7 @@ def time_get_loc(self):
class IntervalIndexMethod:
# GH 24813
- params = [10**3, 10**5]
+ params = [10 ** 3, 10 ** 5]
def setup(self, N):
left = np.append(np.arange(N), np.array(0))
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 4e82fa5592529..489e5c4cd63ea 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -2,26 +2,37 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (Series, DataFrame, MultiIndex,
- Int64Index, UInt64Index, Float64Index,
- IntervalIndex, CategoricalIndex,
- IndexSlice, concat, date_range, option_context)
+from pandas import (
+ Series,
+ DataFrame,
+ MultiIndex,
+ Int64Index,
+ UInt64Index,
+ Float64Index,
+ IntervalIndex,
+ CategoricalIndex,
+ IndexSlice,
+ concat,
+ date_range,
+ option_context,
+)
class NumericSeriesIndexing:
params = [
(Int64Index, UInt64Index, Float64Index),
- ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+ ("unique_monotonic_inc", "nonunique_monotonic_inc"),
]
- param_names = ['index_dtype', 'index_structure']
+ param_names = ["index_dtype", "index_structure"]
def setup(self, index, index_structure):
- N = 10**6
+ N = 10 ** 6
indices = {
- 'unique_monotonic_inc': index(range(N)),
- 'nonunique_monotonic_inc': index(
- list(range(55)) + [54] + list(range(55, N - 1))),
+ "unique_monotonic_inc": index(range(N)),
+ "nonunique_monotonic_inc": index(
+ list(range(55)) + [54] + list(range(55, N - 1))
+ ),
}
self.data = Series(np.random.rand(N), index=indices[index_structure])
self.array = np.arange(10000)
@@ -82,23 +93,25 @@ def time_loc_slice(self, index, index_structure):
class NonNumericSeriesIndexing:
params = [
- ('string', 'datetime'),
- ('unique_monotonic_inc', 'nonunique_monotonic_inc'),
+ ("string", "datetime"),
+ ("unique_monotonic_inc", "nonunique_monotonic_inc"),
]
- param_names = ['index_dtype', 'index_structure']
+ param_names = ["index_dtype", "index_structure"]
def setup(self, index, index_structure):
- N = 10**6
- indexes = {'string': tm.makeStringIndex(N),
- 'datetime': date_range('1900', periods=N, freq='s')}
+ N = 10 ** 6
+ indexes = {
+ "string": tm.makeStringIndex(N),
+ "datetime": date_range("1900", periods=N, freq="s"),
+ }
index = indexes[index]
- if index_structure == 'nonunique_monotonic_inc':
+ if index_structure == "nonunique_monotonic_inc":
index = index.insert(item=index[2], loc=2)[:-1]
self.s = Series(np.random.rand(N), index=index)
self.lbl = index[80000]
def time_getitem_label_slice(self, index, index_structure):
- self.s[:self.lbl]
+ self.s[: self.lbl]
def time_getitem_pos_slice(self, index, index_structure):
self.s[:80000]
@@ -115,12 +128,10 @@ def time_getitem_list_like(self, index, index_structure):
class DataFrameStringIndexing:
-
def setup(self):
index = tm.makeStringIndex(1000)
columns = tm.makeStringIndex(30)
- self.df = DataFrame(np.random.randn(1000, 30), index=index,
- columns=columns)
+ self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
self.idx_scalar = index[100]
self.col_scalar = columns[10]
self.bool_indexer = self.df[self.col_scalar] > 0
@@ -147,7 +158,6 @@ def time_boolean_rows_object(self):
class DataFrameNumericIndexing:
-
def setup(self):
self.idx_dupe = np.array(range(30)) * 99
self.df = DataFrame(np.random.randn(10000, 5))
@@ -172,13 +182,15 @@ def time_bool_indexer(self):
class Take:
- params = ['int', 'datetime']
- param_names = ['index']
+ params = ["int", "datetime"]
+ param_names = ["index"]
def setup(self, index):
N = 100000
- indexes = {'int': Int64Index(np.arange(N)),
- 'datetime': date_range('2011-01-01', freq='S', periods=N)}
+ indexes = {
+ "int": Int64Index(np.arange(N)),
+ "datetime": date_range("2011-01-01", freq="S", periods=N),
+ }
index = indexes[index]
self.s = Series(np.random.rand(N), index=index)
self.indexer = [True, False, True, True, False] * 20000
@@ -188,22 +200,24 @@ def time_take(self, index):
class MultiIndexing:
-
def setup(self):
mi = MultiIndex.from_product([range(1000), range(1000)])
self.s = Series(np.random.randn(1000000), index=mi)
self.df = DataFrame(self.s)
n = 100000
- self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000),
- n),
- 'B': np.random.choice(range(10, 400), n),
- 'C': np.random.choice(range(1, 150), n),
- 'D': np.random.choice(range(10000, 45000), n),
- 'x': np.random.choice(range(400), n),
- 'y': np.random.choice(range(25), n)})
+ self.mdt = DataFrame(
+ {
+ "A": np.random.choice(range(10000, 45000, 1000), n),
+ "B": np.random.choice(range(10, 400), n),
+ "C": np.random.choice(range(1, 150), n),
+ "D": np.random.choice(range(10000, 45000), n),
+ "x": np.random.choice(range(400), n),
+ "y": np.random.choice(range(25), n),
+ }
+ )
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
- self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index()
+ self.mdt = self.mdt.set_index(["A", "B", "C", "D"]).sort_index()
def time_series_ix(self):
self.s.ix[999]
@@ -216,7 +230,6 @@ def time_index_slice(self):
class IntervalIndexing:
-
def setup_cache(self):
idx = IntervalIndex.from_breaks(np.arange(1000001))
monotonic = Series(np.arange(1000000), index=idx)
@@ -237,29 +250,30 @@ def time_loc_list(self, monotonic):
class CategoricalIndexIndexing:
- params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic']
- param_names = ['index']
+ params = ["monotonic_incr", "monotonic_decr", "non_monotonic"]
+ param_names = ["index"]
def setup(self, index):
- N = 10**5
- values = list('a' * N + 'b' * N + 'c' * N)
+ N = 10 ** 5
+ values = list("a" * N + "b" * N + "c" * N)
indices = {
- 'monotonic_incr': CategoricalIndex(values),
- 'monotonic_decr': CategoricalIndex(reversed(values)),
- 'non_monotonic': CategoricalIndex(list('abc' * N))}
+ "monotonic_incr": CategoricalIndex(values),
+ "monotonic_decr": CategoricalIndex(reversed(values)),
+ "non_monotonic": CategoricalIndex(list("abc" * N)),
+ }
self.data = indices[index]
self.int_scalar = 10000
self.int_list = list(range(10000))
- self.cat_scalar = 'b'
- self.cat_list = ['a', 'c']
+ self.cat_scalar = "b"
+ self.cat_list = ["a", "c"]
def time_getitem_scalar(self, index):
self.data[self.int_scalar]
def time_getitem_slice(self, index):
- self.data[:self.int_scalar]
+ self.data[: self.int_scalar]
def time_getitem_list_like(self, index):
self.data[[self.int_scalar]]
@@ -278,7 +292,6 @@ def time_get_indexer_list(self, index):
class MethodLookup:
-
def setup_cache(self):
s = Series()
return s
@@ -294,40 +307,36 @@ def time_lookup_loc(self, s):
class GetItemSingleColumn:
-
def setup(self):
- self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A'])
+ self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=["A"])
self.df_int_col = DataFrame(np.random.randn(3000, 1))
def time_frame_getitem_single_column_label(self):
- self.df_string_col['A']
+ self.df_string_col["A"]
def time_frame_getitem_single_column_int(self):
self.df_int_col[0]
class AssignTimeseriesIndex:
-
def setup(self):
N = 100000
- idx = date_range('1/1/2000', periods=N, freq='H')
- self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx)
+ idx = date_range("1/1/2000", periods=N, freq="H")
+ self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx)
def time_frame_assign_timeseries_index(self):
- self.df['date'] = self.df.index
+ self.df["date"] = self.df.index
class InsertColumns:
-
def setup(self):
- self.N = 10**3
+ self.N = 10 ** 3
self.df = DataFrame(index=range(self.N))
def time_insert(self):
np.random.seed(1234)
for i in range(100):
- self.df.insert(0, i, np.random.randn(self.N),
- allow_duplicates=True)
+ self.df.insert(0, i, np.random.randn(self.N), allow_duplicates=True)
def time_assign_with_setitem(self):
np.random.seed(1234)
@@ -337,18 +346,18 @@ def time_assign_with_setitem(self):
class ChainIndexing:
- params = [None, 'warn']
- param_names = ['mode']
+ params = [None, "warn"]
+ param_names = ["mode"]
def setup(self, mode):
self.N = 1000000
def time_chained_indexing(self, mode):
with warnings.catch_warnings(record=True):
- with option_context('mode.chained_assignment', mode):
- df = DataFrame({'A': np.arange(self.N), 'B': 'foo'})
+ with option_context("mode.chained_assignment", mode):
+ df = DataFrame({"A": np.arange(self.N), "B": "foo"})
df2 = df[df.A > self.N // 2]
- df2['C'] = 1.0
+ df2["C"] = 1.0
from .pandas_vb_common import setup # noqa: F401
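
The hunks in indexing.py above are representative of the whole patch: black normalizes string quotes to double quotes and, when a call or literal no longer fits on one line, explodes it to one element per line with a trailing comma instead of the old hanging-indent wrapping (when the result does fit, as in DataFrameStringIndexing.setup, it is collapsed back onto a single line). A rough before/after sketch of that transformation, using illustrative values in the spirit of MultiIndexing.setup:

    import numpy as np
    from pandas import DataFrame

    n = 1000

    # Pre-black style: single quotes, manual hanging indent.
    df = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), n),
                    'B': np.random.choice(range(10, 400), n),
                    'C': np.random.choice(range(1, 150), n)})

    # Post-black style: double quotes, one item per line, trailing comma.
    df = DataFrame(
        {
            "A": np.random.choice(range(10000, 45000, 1000), n),
            "B": np.random.choice(range(10, 400), n),
            "C": np.random.choice(range(1, 150), n),
        }
    )
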
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 5655701781846..44a22dfa77791 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -5,33 +5,40 @@
def _get_numeric_engines():
engine_names = [
- ('Int64Engine', np.int64), ('Int32Engine', np.int32),
- ('Int16Engine', np.int16), ('Int8Engine', np.int8),
- ('UInt64Engine', np.uint64), ('UInt32Engine', np.uint32),
- ('UInt16engine', np.uint16), ('UInt8Engine', np.uint8),
- ('Float64Engine', np.float64), ('Float32Engine', np.float32),
+ ("Int64Engine", np.int64),
+ ("Int32Engine", np.int32),
+ ("Int16Engine", np.int16),
+ ("Int8Engine", np.int8),
+ ("UInt64Engine", np.uint64),
+ ("UInt32Engine", np.uint32),
+ ("UInt16engine", np.uint16),
+ ("UInt8Engine", np.uint8),
+ ("Float64Engine", np.float64),
+ ("Float32Engine", np.float32),
+ ]
+ return [
+ (getattr(libindex, engine_name), dtype)
+ for engine_name, dtype in engine_names
+ if hasattr(libindex, engine_name)
]
- return [(getattr(libindex, engine_name), dtype)
- for engine_name, dtype in engine_names
- if hasattr(libindex, engine_name)]
class NumericEngineIndexing:
- params = [_get_numeric_engines(),
- ['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
- ]
- param_names = ['engine_and_dtype', 'index_type']
+ params = [
+ _get_numeric_engines(),
+ ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+ ]
+ param_names = ["engine_and_dtype", "index_type"]
def setup(self, engine_and_dtype, index_type):
engine, dtype = engine_and_dtype
- N = 10**5
+ N = 10 ** 5
values = list([1] * N + [2] * N + [3] * N)
arr = {
- 'monotonic_incr': np.array(values, dtype=dtype),
- 'monotonic_decr': np.array(list(reversed(values)),
- dtype=dtype),
- 'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
+ "monotonic_incr": np.array(values, dtype=dtype),
+ "monotonic_decr": np.array(list(reversed(values)), dtype=dtype),
+ "non_monotonic": np.array([1, 2, 3] * N, dtype=dtype),
}[index_type]
self.data = engine(lambda: arr, len(arr))
@@ -44,21 +51,21 @@ def time_get_loc(self, engine_and_dtype, index_type):
class ObjectEngineIndexing:
- params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
- param_names = ['index_type']
+ params = [("monotonic_incr", "monotonic_decr", "non_monotonic")]
+ param_names = ["index_type"]
def setup(self, index_type):
- N = 10**5
- values = list('a' * N + 'b' * N + 'c' * N)
+ N = 10 ** 5
+ values = list("a" * N + "b" * N + "c" * N)
arr = {
- 'monotonic_incr': np.array(values, dtype=object),
- 'monotonic_decr': np.array(list(reversed(values)), dtype=object),
- 'non_monotonic': np.array(list('abc') * N, dtype=object),
+ "monotonic_incr": np.array(values, dtype=object),
+ "monotonic_decr": np.array(list(reversed(values)), dtype=object),
+ "non_monotonic": np.array(list("abc") * N, dtype=object),
}[index_type]
self.data = libindex.ObjectEngine(lambda: arr, len(arr))
# code below avoids populating the mapping etc. while timing.
- self.data.get_loc('b')
+ self.data.get_loc("b")
def time_get_loc(self, index_type):
- self.data.get_loc('b')
+ self.data.get_loc("b")
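
_get_numeric_engines above deliberately filters the engine list through hasattr, so the benchmark module still imports on pandas builds that do not expose every engine class. The same defensive lookup in isolation, assuming the module is imported as libindex the way the (unshown) file header does, with one made-up name to show the filtering effect:

    from pandas._libs import index as libindex

    # "NotARealEngine" is a hypothetical name that hasattr will reject.
    names = ["Int64Engine", "Float64Engine", "NotARealEngine"]

    # Keep only the engine classes this pandas build actually exposes.
    engines = [getattr(libindex, name) for name in names if hasattr(libindex, name)]
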
diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py
index 065c82207d251..66ef4f2aec380 100644
--- a/asv_bench/benchmarks/inference.py
+++ b/asv_bench/benchmarks/inference.py
@@ -8,56 +8,57 @@
class NumericInferOps:
# from GH 7332
params = numeric_dtypes
- param_names = ['dtype']
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 5 * 10**5
- self.df = DataFrame({'A': np.arange(N).astype(dtype),
- 'B': np.arange(N).astype(dtype)})
+ N = 5 * 10 ** 5
+ self.df = DataFrame(
+ {"A": np.arange(N).astype(dtype), "B": np.arange(N).astype(dtype)}
+ )
def time_add(self, dtype):
- self.df['A'] + self.df['B']
+ self.df["A"] + self.df["B"]
def time_subtract(self, dtype):
- self.df['A'] - self.df['B']
+ self.df["A"] - self.df["B"]
def time_multiply(self, dtype):
- self.df['A'] * self.df['B']
+ self.df["A"] * self.df["B"]
def time_divide(self, dtype):
- self.df['A'] / self.df['B']
+ self.df["A"] / self.df["B"]
def time_modulo(self, dtype):
- self.df['A'] % self.df['B']
+ self.df["A"] % self.df["B"]
class DateInferOps:
# from GH 7332
def setup_cache(self):
- N = 5 * 10**5
- df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')})
- df['timedelta'] = df['datetime64'] - df['datetime64']
+ N = 5 * 10 ** 5
+ df = DataFrame({"datetime64": np.arange(N).astype("datetime64[ms]")})
+ df["timedelta"] = df["datetime64"] - df["datetime64"]
return df
def time_subtract_datetimes(self, df):
- df['datetime64'] - df['datetime64']
+ df["datetime64"] - df["datetime64"]
def time_timedelta_plus_datetime(self, df):
- df['timedelta'] + df['datetime64']
+ df["timedelta"] + df["datetime64"]
def time_add_timedeltas(self, df):
- df['timedelta'] + df['timedelta']
+ df["timedelta"] + df["timedelta"]
class ToNumeric:
- params = ['ignore', 'coerce']
- param_names = ['errors']
+ params = ["ignore", "coerce"]
+ param_names = ["errors"]
def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
- self.numstr = self.float.astype('str')
+ self.numstr = self.float.astype("str")
self.str = Series(tm.makeStringIndex(N))
def time_from_float(self, errors):
@@ -72,21 +73,32 @@ def time_from_str(self, errors):
class ToNumericDowncast:
- param_names = ['dtype', 'downcast']
- params = [['string-float', 'string-int', 'string-nint', 'datetime64',
- 'int-list', 'int32'],
- [None, 'integer', 'signed', 'unsigned', 'float']]
+ param_names = ["dtype", "downcast"]
+ params = [
+ [
+ "string-float",
+ "string-int",
+ "string-nint",
+ "datetime64",
+ "int-list",
+ "int32",
+ ],
+ [None, "integer", "signed", "unsigned", "float"],
+ ]
N = 500000
N2 = int(N / 2)
- data_dict = {'string-int': ['1'] * N2 + [2] * N2,
- 'string-nint': ['-1'] * N2 + [2] * N2,
- 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
- dtype='datetime64[D]'), N),
- 'string-float': ['1.1'] * N2 + [2] * N2,
- 'int-list': [1] * N2 + [2] * N2,
- 'int32': np.repeat(np.int32(1), N)}
+ data_dict = {
+ "string-int": ["1"] * N2 + [2] * N2,
+ "string-nint": ["-1"] * N2 + [2] * N2,
+ "datetime64": np.repeat(
+ np.array(["1970-01-01", "1970-01-02"], dtype="datetime64[D]"), N
+ ),
+ "string-float": ["1.1"] * N2 + [2] * N2,
+ "int-list": [1] * N2 + [2] * N2,
+ "int32": np.repeat(np.int32(1), N),
+ }
def setup(self, dtype, downcast):
self.data = self.data_dict[dtype]
@@ -96,10 +108,9 @@ def time_downcast(self, dtype, downcast):
class MaybeConvertNumeric:
-
def setup_cache(self):
- N = 10**6
- arr = np.repeat([2**63], N) + np.arange(N).astype('uint64')
+ N = 10 ** 6
+ arr = np.repeat([2 ** 63], N) + np.arange(N).astype("uint64")
data = arr.astype(object)
data[1::2] = arr[1::2].astype(str)
data[-1] = -1
diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py
index fbb96380a5813..4525e504fc4dd 100644
--- a/asv_bench/benchmarks/io/csv.py
+++ b/asv_bench/benchmarks/io/csv.py
@@ -11,27 +11,31 @@
class ToCSV(BaseIO):
- fname = '__test__.csv'
- params = ['wide', 'long', 'mixed']
- param_names = ['kind']
+ fname = "__test__.csv"
+ params = ["wide", "long", "mixed"]
+ param_names = ["kind"]
def setup(self, kind):
wide_frame = DataFrame(np.random.randn(3000, 30))
- long_frame = DataFrame({'A': np.arange(50000),
- 'B': np.arange(50000) + 1.,
- 'C': np.arange(50000) + 2.,
- 'D': np.arange(50000) + 3.})
- mixed_frame = DataFrame({'float': np.random.randn(5000),
- 'int': np.random.randn(5000).astype(int),
- 'bool': (np.arange(5000) % 2) == 0,
- 'datetime': date_range('2001',
- freq='s',
- periods=5000),
- 'object': ['foo'] * 5000})
- mixed_frame.loc[30:500, 'float'] = np.nan
- data = {'wide': wide_frame,
- 'long': long_frame,
- 'mixed': mixed_frame}
+ long_frame = DataFrame(
+ {
+ "A": np.arange(50000),
+ "B": np.arange(50000) + 1.0,
+ "C": np.arange(50000) + 2.0,
+ "D": np.arange(50000) + 3.0,
+ }
+ )
+ mixed_frame = DataFrame(
+ {
+ "float": np.random.randn(5000),
+ "int": np.random.randn(5000).astype(int),
+ "bool": (np.arange(5000) % 2) == 0,
+ "datetime": date_range("2001", freq="s", periods=5000),
+ "object": ["foo"] * 5000,
+ }
+ )
+ mixed_frame.loc[30:500, "float"] = np.nan
+ data = {"wide": wide_frame, "long": long_frame, "mixed": mixed_frame}
self.df = data[kind]
def time_frame(self, kind):
@@ -40,36 +44,39 @@ def time_frame(self, kind):
class ToCSVDatetime(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
def setup(self):
- rng = date_range('1/1/2000', periods=1000)
+ rng = date_range("1/1/2000", periods=1000)
self.data = DataFrame(rng, index=rng)
def time_frame_date_formatting(self):
- self.data.to_csv(self.fname, date_format='%Y%m%d')
+ self.data.to_csv(self.fname, date_format="%Y%m%d")
class ToCSVDatetimeBig(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
timeout = 1500
params = [1000, 10000, 100000]
- param_names = ['obs']
+ param_names = ["obs"]
def setup(self, obs):
- d = '2018-11-29'
- dt = '2018-11-26 11:18:27.0'
- self.data = DataFrame({'dt': [np.datetime64(dt)] * obs,
- 'd': [np.datetime64(d)] * obs,
- 'r': [np.random.uniform()] * obs})
+ d = "2018-11-29"
+ dt = "2018-11-26 11:18:27.0"
+ self.data = DataFrame(
+ {
+ "dt": [np.datetime64(dt)] * obs,
+ "d": [np.datetime64(d)] * obs,
+ "r": [np.random.uniform()] * obs,
+ }
+ )
def time_frame(self, obs):
self.data.to_csv(self.fname)
class StringIORewind:
-
def data(self, stringio_object):
stringio_object.seek(0)
return stringio_object
@@ -77,68 +84,84 @@ def data(self, stringio_object):
class ReadCSVDInferDatetimeFormat(StringIORewind):
- params = ([True, False], ['custom', 'iso8601', 'ymd'])
- param_names = ['infer_datetime_format', 'format']
+ params = ([True, False], ["custom", "iso8601", "ymd"])
+ param_names = ["infer_datetime_format", "format"]
def setup(self, infer_datetime_format, format):
- rng = date_range('1/1/2000', periods=1000)
- formats = {'custom': '%m/%d/%Y %H:%M:%S.%f',
- 'iso8601': '%Y-%m-%d %H:%M:%S',
- 'ymd': '%Y%m%d'}
+ rng = date_range("1/1/2000", periods=1000)
+ formats = {
+ "custom": "%m/%d/%Y %H:%M:%S.%f",
+ "iso8601": "%Y-%m-%d %H:%M:%S",
+ "ymd": "%Y%m%d",
+ }
dt_format = formats[format]
- self.StringIO_input = StringIO('\n'.join(
- rng.strftime(dt_format).tolist()))
+ self.StringIO_input = StringIO("\n".join(rng.strftime(dt_format).tolist()))
def time_read_csv(self, infer_datetime_format, format):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo'], parse_dates=['foo'],
- infer_datetime_format=infer_datetime_format)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo"],
+ parse_dates=["foo"],
+ infer_datetime_format=infer_datetime_format,
+ )
class ReadCSVConcatDatetime(StringIORewind):
- iso8601 = '%Y-%m-%d %H:%M:%S'
+ iso8601 = "%Y-%m-%d %H:%M:%S"
def setup(self):
- rng = date_range('1/1/2000', periods=50000, freq='S')
- self.StringIO_input = StringIO('\n'.join(
- rng.strftime(self.iso8601).tolist()))
+ rng = date_range("1/1/2000", periods=50000, freq="S")
+ self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist()))
def time_read_csv(self):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo'], parse_dates=['foo'],
- infer_datetime_format=False)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo"],
+ parse_dates=["foo"],
+ infer_datetime_format=False,
+ )
class ReadCSVConcatDatetimeBadDateValue(StringIORewind):
- params = (['nan', '0', ''],)
- param_names = ['bad_date_value']
+ params = (["nan", "0", ""],)
+ param_names = ["bad_date_value"]
def setup(self, bad_date_value):
- self.StringIO_input = StringIO(('%s,\n' % bad_date_value) * 50000)
+ self.StringIO_input = StringIO(("%s,\n" % bad_date_value) * 50000)
def time_read_csv(self, bad_date_value):
- read_csv(self.data(self.StringIO_input),
- header=None, names=['foo', 'bar'], parse_dates=['foo'],
- infer_datetime_format=False)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ names=["foo", "bar"],
+ parse_dates=["foo"],
+ infer_datetime_format=False,
+ )
class ReadCSVSkipRows(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
params = [None, 10000]
- param_names = ['skiprows']
+ param_names = ["skiprows"]
def setup(self, skiprows):
N = 20000
index = tm.makeStringIndex(N)
- df = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N),
- 'string1': ['foo'] * N,
- 'bool1': [True] * N,
- 'int1': np.random.randint(0, N, size=N)},
- index=index)
+ df = DataFrame(
+ {
+ "float1": np.random.randn(N),
+ "float2": np.random.randn(N),
+ "string1": ["foo"] * N,
+ "bool1": [True] * N,
+ "int1": np.random.randint(0, N, size=N),
+ },
+ index=index,
+ )
df.to_csv(self.fname)
def time_skipprows(self, skiprows):
@@ -146,31 +169,31 @@ def time_skipprows(self, skiprows):
class ReadUint64Integers(StringIORewind):
-
def setup(self):
- self.na_values = [2**63 + 500]
- arr = np.arange(10000).astype('uint64') + 2**63
- self.data1 = StringIO('\n'.join(arr.astype(str).tolist()))
+ self.na_values = [2 ** 63 + 500]
+ arr = np.arange(10000).astype("uint64") + 2 ** 63
+ self.data1 = StringIO("\n".join(arr.astype(str).tolist()))
arr = arr.astype(object)
arr[500] = -1
- self.data2 = StringIO('\n'.join(arr.astype(str).tolist()))
+ self.data2 = StringIO("\n".join(arr.astype(str).tolist()))
def time_read_uint64(self):
- read_csv(self.data(self.data1), header=None, names=['foo'])
+ read_csv(self.data(self.data1), header=None, names=["foo"])
def time_read_uint64_neg_values(self):
- read_csv(self.data(self.data2), header=None, names=['foo'])
+ read_csv(self.data(self.data2), header=None, names=["foo"])
def time_read_uint64_na_values(self):
- read_csv(self.data(self.data1), header=None, names=['foo'],
- na_values=self.na_values)
+ read_csv(
+ self.data(self.data1), header=None, names=["foo"], na_values=self.na_values
+ )
class ReadCSVThousands(BaseIO):
- fname = '__test__.csv'
- params = ([',', '|'], [None, ','])
- param_names = ['sep', 'thousands']
+ fname = "__test__.csv"
+ params = ([",", "|"], [None, ","])
+ param_names = ["sep", "thousands"]
def setup(self, sep, thousands):
N = 10000
@@ -178,8 +201,8 @@ def setup(self, sep, thousands):
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
df = DataFrame(data)
if thousands is not None:
- fmt = ':{}'.format(thousands)
- fmt = '{' + fmt + '}'
+ fmt = ":{}".format(thousands)
+ fmt = "{" + fmt + "}"
df = df.applymap(lambda x: fmt.format(x))
df.to_csv(self.fname, sep=sep)
@@ -188,57 +211,68 @@ def time_thousands(self, sep, thousands):
class ReadCSVComment(StringIORewind):
-
def setup(self):
- data = ['A,B,C'] + (['1,2,3 # comment'] * 100000)
- self.StringIO_input = StringIO('\n'.join(data))
+ data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
+ self.StringIO_input = StringIO("\n".join(data))
def time_comment(self):
- read_csv(self.data(self.StringIO_input), comment='#',
- header=None, names=list('abc'))
+ read_csv(
+ self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
+ )
class ReadCSVFloatPrecision(StringIORewind):
- params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip'])
- param_names = ['sep', 'decimal', 'float_precision']
+ params = ([",", ";"], [".", "_"], [None, "high", "round_trip"])
+ param_names = ["sep", "decimal", "float_precision"]
def setup(self, sep, decimal, float_precision):
- floats = [''.join(random.choice(string.digits) for _ in range(28))
- for _ in range(15)]
- rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n'
+ floats = [
+ "".join(random.choice(string.digits) for _ in range(28)) for _ in range(15)
+ ]
+ rows = sep.join(["0{}".format(decimal) + "{}"] * 3) + "\n"
data = rows * 5
data = data.format(*floats) * 200 # 1000 x 3 strings csv
self.StringIO_input = StringIO(data)
def time_read_csv(self, sep, decimal, float_precision):
- read_csv(self.data(self.StringIO_input), sep=sep, header=None,
- names=list('abc'), float_precision=float_precision)
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=sep,
+ header=None,
+ names=list("abc"),
+ float_precision=float_precision,
+ )
def time_read_csv_python_engine(self, sep, decimal, float_precision):
- read_csv(self.data(self.StringIO_input), sep=sep, header=None,
- engine='python', float_precision=None, names=list('abc'))
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=sep,
+ header=None,
+ engine="python",
+ float_precision=None,
+ names=list("abc"),
+ )
class ReadCSVCategorical(BaseIO):
- fname = '__test__.csv'
+ fname = "__test__.csv"
def setup(self):
N = 100000
- group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']
- df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc'))
+ group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
+ df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
df.to_csv(self.fname, index=False)
def time_convert_post(self):
read_csv(self.fname).apply(Categorical)
def time_convert_direct(self):
- read_csv(self.fname, dtype='category')
+ read_csv(self.fname, dtype="category")
class ReadCSVParseDates(StringIORewind):
-
def setup(self):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
@@ -246,34 +280,47 @@ def setup(self):
{},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n
{},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n
"""
- two_cols = ['KORD,19990127'] * 5
+ two_cols = ["KORD,19990127"] * 5
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)
def time_multiple_date(self):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=list(string.digits[:9]),
- parse_dates=[[1, 2], [1, 3]])
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=list(string.digits[:9]),
+ parse_dates=[[1, 2], [1, 3]],
+ )
def time_baseline(self):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- parse_dates=[1],
- names=list(string.digits[:9]))
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ parse_dates=[1],
+ names=list(string.digits[:9]),
+ )
class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
- param_names = ['do_cache']
+ param_names = ["do_cache"]
def setup(self, do_cache):
- data = ('\n'.join('10/{}'.format(year)
- for year in range(2000, 2100)) + '\n') * 10
+ data = (
+ "\n".join("10/{}".format(year) for year in range(2000, 2100)) + "\n"
+ ) * 10
self.StringIO_input = StringIO(data)
def time_read_csv_cached(self, do_cache):
try:
- read_csv(self.data(self.StringIO_input), header=None,
- parse_dates=[0], cache_dates=do_cache)
+ read_csv(
+ self.data(self.StringIO_input),
+ header=None,
+ parse_dates=[0],
+ cache_dates=do_cache,
+ )
except TypeError:
# cache_dates is a new keyword in 0.25
pass
@@ -299,12 +346,12 @@ def mem_parser_chunks(self):
class ReadCSVParseSpecialDate(StringIORewind):
- params = (['mY', 'mdY', 'hm'],)
- param_names = ['value']
+ params = (["mY", "mdY", "hm"],)
+ param_names = ["value"]
objects = {
- 'mY': '01-2019\n10-2019\n02/2000\n',
- 'mdY': '12/02/2010\n',
- 'hm': '21:34\n'
+ "mY": "01-2019\n10-2019\n02/2000\n",
+ "mdY": "12/02/2010\n",
+ "hm": "21:34\n",
}
def setup(self, value):
@@ -313,38 +360,50 @@ def setup(self, value):
self.StringIO_input = StringIO(data)
def time_read_special_date(self, value):
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=['Date'], parse_dates=['Date'])
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=["Date"],
+ parse_dates=["Date"],
+ )
class ParseDateComparison(StringIORewind):
params = ([False, True],)
- param_names = ['cache_dates']
+ param_names = ["cache_dates"]
def setup(self, cache_dates):
count_elem = 10000
- data = '12-02-2010\n' * count_elem
+ data = "12-02-2010\n" * count_elem
self.StringIO_input = StringIO(data)
def time_read_csv_dayfirst(self, cache_dates):
try:
- read_csv(self.data(self.StringIO_input), sep=',', header=None,
- names=['Date'], parse_dates=['Date'],
- cache_dates=cache_dates,
- dayfirst=True)
+ read_csv(
+ self.data(self.StringIO_input),
+ sep=",",
+ header=None,
+ names=["Date"],
+ parse_dates=["Date"],
+ cache_dates=cache_dates,
+ dayfirst=True,
+ )
except TypeError:
# cache_dates is a new keyword in 0.25
pass
def time_to_datetime_dayfirst(self, cache_dates):
- df = read_csv(self.data(self.StringIO_input),
- dtype={'date': str}, names=['date'])
- to_datetime(df['date'], cache=cache_dates, dayfirst=True)
+ df = read_csv(
+ self.data(self.StringIO_input), dtype={"date": str}, names=["date"]
+ )
+ to_datetime(df["date"], cache=cache_dates, dayfirst=True)
def time_to_datetime_format_DD_MM_YYYY(self, cache_dates):
- df = read_csv(self.data(self.StringIO_input),
- dtype={'date': str}, names=['date'])
- to_datetime(df['date'], cache=cache_dates, format='%d-%m-%Y')
+ df = read_csv(
+ self.data(self.StringIO_input), dtype={"date": str}, names=["date"]
+ )
+ to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y")
from ..pandas_vb_common import setup # noqa: F401
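
All of the ReadCSV* classes above inherit StringIORewind, whose data() method rewinds the shared StringIO buffer so every timed call re-parses the same input rather than reading from an exhausted stream. A stripped-down sketch of that pattern (class name and buffer contents are illustrative, not taken from the patch):

    from io import StringIO

    from pandas import read_csv

    class RewindExample:
        def setup(self):
            # Built once; read_csv consumes the stream on every call.
            self.buf = StringIO("a,b\n1,2\n3,4\n")

        def data(self):
            # Rewind so a repeated benchmark call sees the full input again.
            self.buf.seek(0)
            return self.buf

        def time_read_csv(self):
            read_csv(self.data())
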
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
index 1decb83f2f723..12e70f84e5203 100644
--- a/asv_bench/benchmarks/io/excel.py
+++ b/asv_bench/benchmarks/io/excel.py
@@ -6,19 +6,21 @@
class Excel:
- params = ['openpyxl', 'xlsxwriter', 'xlwt']
- param_names = ['engine']
+ params = ["openpyxl", "xlsxwriter", "xlwt"]
+ param_names = ["engine"]
def setup(self, engine):
N = 2000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.bio_read = BytesIO()
self.writer_read = ExcelWriter(self.bio_read, engine=engine)
- self.df.to_excel(self.writer_read, sheet_name='Sheet1')
+ self.df.to_excel(self.writer_read, sheet_name="Sheet1")
self.writer_read.save()
self.bio_read.seek(0)
@@ -29,7 +31,7 @@ def time_write_excel(self, engine):
bio_write = BytesIO()
bio_write.seek(0)
writer_write = ExcelWriter(bio_write, engine=engine)
- self.df.to_excel(writer_write, sheet_name='Sheet1')
+ self.df.to_excel(writer_write, sheet_name="Sheet1")
writer_write.save()
diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py
index a5dc28eb9508c..2874a7889156b 100644
--- a/asv_bench/benchmarks/io/hdf.py
+++ b/asv_bench/benchmarks/io/hdf.py
@@ -6,86 +6,92 @@
class HDFStoreDataFrame(BaseIO):
-
def setup(self):
N = 25000
index = tm.makeStringIndex(N)
- self.df = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N)},
- index=index)
- self.df_mixed = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N),
- 'string1': ['foo'] * N,
- 'bool1': [True] * N,
- 'int1': np.random.randint(0, N, size=N)},
- index=index)
+ self.df = DataFrame(
+ {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index
+ )
+ self.df_mixed = DataFrame(
+ {
+ "float1": np.random.randn(N),
+ "float2": np.random.randn(N),
+ "string1": ["foo"] * N,
+ "bool1": [True] * N,
+ "int1": np.random.randint(0, N, size=N),
+ },
+ index=index,
+ )
self.df_wide = DataFrame(np.random.randn(N, 100))
self.start_wide = self.df_wide.index[10000]
self.stop_wide = self.df_wide.index[15000]
- self.df2 = DataFrame({'float1': np.random.randn(N),
- 'float2': np.random.randn(N)},
- index=date_range('1/1/2000', periods=N))
+ self.df2 = DataFrame(
+ {"float1": np.random.randn(N), "float2": np.random.randn(N)},
+ index=date_range("1/1/2000", periods=N),
+ )
self.start = self.df2.index[10000]
self.stop = self.df2.index[15000]
- self.df_wide2 = DataFrame(np.random.randn(N, 100),
- index=date_range('1/1/2000', periods=N))
- self.df_dc = DataFrame(np.random.randn(N, 10),
- columns=['C%03d' % i for i in range(10)])
+ self.df_wide2 = DataFrame(
+ np.random.randn(N, 100), index=date_range("1/1/2000", periods=N)
+ )
+ self.df_dc = DataFrame(
+ np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)]
+ )
- self.fname = '__test__.h5'
+ self.fname = "__test__.h5"
self.store = HDFStore(self.fname)
- self.store.put('fixed', self.df)
- self.store.put('fixed_mixed', self.df_mixed)
- self.store.append('table', self.df2)
- self.store.append('table_mixed', self.df_mixed)
- self.store.append('table_wide', self.df_wide)
- self.store.append('table_wide2', self.df_wide2)
+ self.store.put("fixed", self.df)
+ self.store.put("fixed_mixed", self.df_mixed)
+ self.store.append("table", self.df2)
+ self.store.append("table_mixed", self.df_mixed)
+ self.store.append("table_wide", self.df_wide)
+ self.store.append("table_wide2", self.df_wide2)
def teardown(self):
self.store.close()
self.remove(self.fname)
def time_read_store(self):
- self.store.get('fixed')
+ self.store.get("fixed")
def time_read_store_mixed(self):
- self.store.get('fixed_mixed')
+ self.store.get("fixed_mixed")
def time_write_store(self):
- self.store.put('fixed_write', self.df)
+ self.store.put("fixed_write", self.df)
def time_write_store_mixed(self):
- self.store.put('fixed_mixed_write', self.df_mixed)
+ self.store.put("fixed_mixed_write", self.df_mixed)
def time_read_store_table_mixed(self):
- self.store.select('table_mixed')
+ self.store.select("table_mixed")
def time_write_store_table_mixed(self):
- self.store.append('table_mixed_write', self.df_mixed)
+ self.store.append("table_mixed_write", self.df_mixed)
def time_read_store_table(self):
- self.store.select('table')
+ self.store.select("table")
def time_write_store_table(self):
- self.store.append('table_write', self.df)
+ self.store.append("table_write", self.df)
def time_read_store_table_wide(self):
- self.store.select('table_wide')
+ self.store.select("table_wide")
def time_write_store_table_wide(self):
- self.store.append('table_wide_write', self.df_wide)
+ self.store.append("table_wide_write", self.df_wide)
def time_write_store_table_dc(self):
- self.store.append('table_dc_write', self.df_dc, data_columns=True)
+ self.store.append("table_dc_write", self.df_dc, data_columns=True)
def time_query_store_table_wide(self):
- self.store.select('table_wide', where="index > self.start_wide and "
- "index < self.stop_wide")
+ self.store.select(
+ "table_wide", where="index > self.start_wide and " "index < self.stop_wide"
+ )
def time_query_store_table(self):
- self.store.select('table', where="index > self.start and "
- "index < self.stop")
+ self.store.select("table", where="index > self.start and " "index < self.stop")
def time_store_repr(self):
repr(self.store)
@@ -99,24 +105,26 @@ def time_store_info(self):
class HDF(BaseIO):
- params = ['table', 'fixed']
- param_names = ['format']
+ params = ["table", "fixed"]
+ param_names = ["format"]
def setup(self, format):
- self.fname = '__test__.h5'
+ self.fname = "__test__.h5"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
- self.df.to_hdf(self.fname, 'df', format=format)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
+ self.df.to_hdf(self.fname, "df", format=format)
def time_read_hdf(self, format):
- read_hdf(self.fname, 'df')
+ read_hdf(self.fname, "df")
def time_write_hdf(self, format):
- self.df.to_hdf(self.fname, 'df', format=format)
+ self.df.to_hdf(self.fname, "df", format=format)
from ..pandas_vb_common import setup # noqa: F401
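
HDF and HDFStoreDataFrame follow the same asv conventions as the rest of the suite: class-level params and param_names are expanded by the runner, and each setup and time_* method receives the current parameter combination as arguments. A minimal sketch of that convention (hypothetical benchmark, not part of the patch):

    import numpy as np
    from pandas import DataFrame

    class ExampleBenchmark:
        # asv runs setup and each time_* method once per entry in params.
        params = ["int64", "float64"]
        param_names = ["dtype"]

        def setup(self, dtype):
            self.df = DataFrame(np.ones((1000, 5), dtype=dtype))

        def time_sum(self, dtype):
            self.df.sum()
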
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 19d11e6610198..0ce42856fb14a 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -8,16 +8,20 @@
class ReadJSON(BaseIO):
fname = "__test__.json"
- params = (['split', 'index', 'records'], ['int', 'datetime'])
- param_names = ['orient', 'index']
+ params = (["split", "index", "records"], ["int", "datetime"])
+ param_names = ["orient", "index"]
def setup(self, orient, index):
N = 100000
- indexes = {'int': np.arange(N),
- 'datetime': date_range('20000101', periods=N, freq='H')}
- df = DataFrame(np.random.randn(N, 5),
- columns=['float_{}'.format(i) for i in range(5)],
- index=indexes[index])
+ indexes = {
+ "int": np.arange(N),
+ "datetime": date_range("20000101", periods=N, freq="H"),
+ }
+ df = DataFrame(
+ np.random.randn(N, 5),
+ columns=["float_{}".format(i) for i in range(5)],
+ index=indexes[index],
+ )
df.to_json(self.fname, orient=orient)
def time_read_json(self, orient, index):
@@ -27,71 +31,85 @@ def time_read_json(self, orient, index):
class ReadJSONLines(BaseIO):
fname = "__test_lines__.json"
- params = ['int', 'datetime']
- param_names = ['index']
+ params = ["int", "datetime"]
+ param_names = ["index"]
def setup(self, index):
N = 100000
- indexes = {'int': np.arange(N),
- 'datetime': date_range('20000101', periods=N, freq='H')}
- df = DataFrame(np.random.randn(N, 5),
- columns=['float_{}'.format(i) for i in range(5)],
- index=indexes[index])
- df.to_json(self.fname, orient='records', lines=True)
+ indexes = {
+ "int": np.arange(N),
+ "datetime": date_range("20000101", periods=N, freq="H"),
+ }
+ df = DataFrame(
+ np.random.randn(N, 5),
+ columns=["float_{}".format(i) for i in range(5)],
+ index=indexes[index],
+ )
+ df.to_json(self.fname, orient="records", lines=True)
def time_read_json_lines(self, index):
- read_json(self.fname, orient='records', lines=True)
+ read_json(self.fname, orient="records", lines=True)
def time_read_json_lines_concat(self, index):
- concat(read_json(self.fname, orient='records', lines=True,
- chunksize=25000))
+ concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
def peakmem_read_json_lines(self, index):
- read_json(self.fname, orient='records', lines=True)
+ read_json(self.fname, orient="records", lines=True)
def peakmem_read_json_lines_concat(self, index):
- concat(read_json(self.fname, orient='records', lines=True,
- chunksize=25000))
+ concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))
class ToJSON(BaseIO):
fname = "__test__.json"
- params = ['split', 'columns', 'index']
- param_names = ['orient']
+ params = ["split", "columns", "index"]
+ param_names = ["orient"]
def setup(self, lines_orient):
- N = 10**5
+ N = 10 ** 5
ncols = 5
- index = date_range('20000101', periods=N, freq='H')
- timedeltas = timedelta_range(start=1, periods=N, freq='s')
- datetimes = date_range(start=1, periods=N, freq='s')
+ index = date_range("20000101", periods=N, freq="H")
+ timedeltas = timedelta_range(start=1, periods=N, freq="s")
+ datetimes = date_range(start=1, periods=N, freq="s")
ints = np.random.randint(100000000, size=N)
floats = np.random.randn(N)
strings = tm.makeStringIndex(N)
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
- self.df_td_int_ts = DataFrame({'td_1': timedeltas,
- 'td_2': timedeltas,
- 'int_1': ints,
- 'int_2': ints,
- 'ts_1': datetimes,
- 'ts_2': datetimes},
- index=index)
- self.df_int_floats = DataFrame({'int_1': ints,
- 'int_2': ints,
- 'int_3': ints,
- 'float_1': floats,
- 'float_2': floats,
- 'float_3': floats},
- index=index)
- self.df_int_float_str = DataFrame({'int_1': ints,
- 'int_2': ints,
- 'float_1': floats,
- 'float_2': floats,
- 'str_1': strings,
- 'str_2': strings},
- index=index)
+ self.df_td_int_ts = DataFrame(
+ {
+ "td_1": timedeltas,
+ "td_2": timedeltas,
+ "int_1": ints,
+ "int_2": ints,
+ "ts_1": datetimes,
+ "ts_2": datetimes,
+ },
+ index=index,
+ )
+ self.df_int_floats = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "int_3": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "float_3": floats,
+ },
+ index=index,
+ )
+ self.df_int_float_str = DataFrame(
+ {
+ "int_1": ints,
+ "int_2": ints,
+ "float_1": floats,
+ "float_2": floats,
+ "str_1": strings,
+ "str_2": strings,
+ },
+ index=index,
+ )
def time_floats_with_int_index(self, orient):
self.df.to_json(self.fname, orient=orient)
@@ -109,39 +127,35 @@ def time_float_int_str(self, orient):
self.df_int_float_str.to_json(self.fname, orient=orient)
def time_floats_with_int_idex_lines(self, orient):
- self.df.to_json(self.fname, orient='records', lines=True)
+ self.df.to_json(self.fname, orient="records", lines=True)
def time_floats_with_dt_index_lines(self, orient):
- self.df_date_idx.to_json(self.fname, orient='records', lines=True)
+ self.df_date_idx.to_json(self.fname, orient="records", lines=True)
def time_delta_int_tstamp_lines(self, orient):
- self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)
+ self.df_td_int_ts.to_json(self.fname, orient="records", lines=True)
def time_float_int_lines(self, orient):
- self.df_int_floats.to_json(self.fname, orient='records', lines=True)
+ self.df_int_floats.to_json(self.fname, orient="records", lines=True)
def time_float_int_str_lines(self, orient):
- self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
+ self.df_int_float_str.to_json(self.fname, orient="records", lines=True)
class ToJSONMem:
-
def setup_cache(self):
df = DataFrame([[1]])
- frames = {
- 'int': df,
- 'float': df.astype(float),
- }
+ frames = {"int": df, "float": df.astype(float)}
return frames
def peakmem_int(self, frames):
- df = frames['int']
+ df = frames["int"]
for _ in range(100_000):
df.to_json()
def peakmem_float(self, frames):
- df = frames['float']
+ df = frames["float"]
for _ in range(100_000):
df.to_json()
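
ToJSONMem (like MethodLookup, DateInferOps, and MaybeConvertNumeric earlier) relies on asv's setup_cache hook: it runs once, its result is cached, and that result is passed as the first argument to every time_*/peakmem_* method, which is why the methods above take a frames or df parameter. A small sketch of that contract (illustrative benchmark, not from the patch):

    from pandas import DataFrame

    class ExampleCachedBenchmark:
        def setup_cache(self):
            # Executed a single time; the result is handed to each method below.
            return DataFrame({"x": range(1000)})

        def time_describe(self, df):
            df.describe()
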
diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py
index dc2642d920fd0..c43df7c2e91ed 100644
--- a/asv_bench/benchmarks/io/msgpack.py
+++ b/asv_bench/benchmarks/io/msgpack.py
@@ -6,15 +6,16 @@
class MSGPack(BaseIO):
-
def setup(self):
- self.fname = '__test__.msg'
+ self.fname = "__test__.msg"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.df.to_msgpack(self.fname)
def time_read_msgpack(self):
diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py
index edba0358c821a..40256e043a008 100644
--- a/asv_bench/benchmarks/io/parsers.py
+++ b/asv_bench/benchmarks/io/parsers.py
@@ -2,7 +2,9 @@
try:
from pandas._libs.tslibs.parsing import (
- _concat_date_cols, _does_string_look_like_datetime)
+ _concat_date_cols,
+ _does_string_look_like_datetime,
+ )
except ImportError:
# Avoid whole benchmark suite import failure on asv (currently 0.4)
pass
@@ -10,8 +12,8 @@
class DoesStringLookLikeDatetime(object):
- params = (['2Q2005', '0.0', '10000'],)
- param_names = ['value']
+ params = (["2Q2005", "0.0", "10000"],)
+ param_names = ["value"]
def setup(self, value):
self.objects = [value] * 1000000
@@ -23,16 +25,18 @@ def time_check_datetimes(self, value):
class ConcatDateCols(object):
- params = ([1234567890, 'AAAA'], [1, 2])
- param_names = ['value', 'dim']
+ params = ([1234567890, "AAAA"], [1, 2])
+ param_names = ["value", "dim"]
def setup(self, value, dim):
count_elem = 10000
if dim == 1:
self.object = (np.array([value] * count_elem),)
if dim == 2:
- self.object = (np.array([value] * count_elem),
- np.array([value] * count_elem))
+ self.object = (
+ np.array([value] * count_elem),
+ np.array([value] * count_elem),
+ )
def time_check_concat(self, value, dim):
_concat_date_cols(self.object)
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
index 74a58bbb946aa..286ac767c02e7 100644
--- a/asv_bench/benchmarks/io/pickle.py
+++ b/asv_bench/benchmarks/io/pickle.py
@@ -6,15 +6,16 @@
class Pickle(BaseIO):
-
def setup(self):
- self.fname = '__test__.pkl'
+ self.fname = "__test__.pkl"
N = 100000
C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(N)
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(N)
self.df.to_pickle(self.fname)
def time_read_pickle(self):
diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py
index 8181f1d41ac70..7ce8ef8c12639 100644
--- a/asv_bench/benchmarks/io/sas.py
+++ b/asv_bench/benchmarks/io/sas.py
@@ -5,15 +5,25 @@
class SAS:
- params = ['sas7bdat', 'xport']
- param_names = ['format']
+ params = ["sas7bdat", "xport"]
+ param_names = ["format"]
def setup(self, format):
# Read files that are located in 'pandas/tests/io/sas/data'
- files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'}
+ files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"}
file = files[format]
- paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas',
- 'tests', 'io', 'sas', 'data', file]
+ paths = [
+ os.path.dirname(__file__),
+ "..",
+ "..",
+ "..",
+ "pandas",
+ "tests",
+ "io",
+ "sas",
+ "data",
+ file,
+ ]
self.f = os.path.join(*paths)
def time_read_msgpack(self, format):
diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py
index ee48f3bd0a3ab..b80872b17a9e4 100644
--- a/asv_bench/benchmarks/io/sql.py
+++ b/asv_bench/benchmarks/io/sql.py
@@ -8,31 +8,35 @@
class SQL:
- params = ['sqlalchemy', 'sqlite']
- param_names = ['connection']
+ params = ["sqlalchemy", "sqlite"]
+ param_names = ["connection"]
def setup(self, connection):
N = 10000
- con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
- 'sqlite': sqlite3.connect(':memory:')}
- self.table_name = 'test_type'
- self.query_all = 'SELECT * FROM {}'.format(self.table_name)
+ con = {
+ "sqlalchemy": create_engine("sqlite:///:memory:"),
+ "sqlite": sqlite3.connect(":memory:"),
+ }
+ self.table_name = "test_type"
+ self.query_all = "SELECT * FROM {}".format(self.table_name)
self.con = con[connection]
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_to_sql_dataframe(self, connection):
- self.df.to_sql('test1', self.con, if_exists='replace')
+ self.df.to_sql("test1", self.con, if_exists="replace")
def time_read_sql_query(self, connection):
read_sql_query(self.query_all, self.con)
@@ -40,85 +44,98 @@ def time_read_sql_query(self, connection):
class WriteSQLDtypes:
- params = (['sqlalchemy', 'sqlite'],
- ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'])
- param_names = ['connection', 'dtype']
+ params = (
+ ["sqlalchemy", "sqlite"],
+ ["float", "float_with_nan", "string", "bool", "int", "datetime"],
+ )
+ param_names = ["connection", "dtype"]
def setup(self, connection, dtype):
N = 10000
- con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
- 'sqlite': sqlite3.connect(':memory:')}
- self.table_name = 'test_type'
- self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name)
+ con = {
+ "sqlalchemy": create_engine("sqlite:///:memory:"),
+ "sqlite": sqlite3.connect(":memory:"),
+ }
+ self.table_name = "test_type"
+ self.query_col = "SELECT {} FROM {}".format(dtype, self.table_name)
self.con = con[connection]
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_to_sql_dataframe_column(self, connection, dtype):
- self.df[[dtype]].to_sql('test1', self.con, if_exists='replace')
+ self.df[[dtype]].to_sql("test1", self.con, if_exists="replace")
def time_read_sql_query_select_column(self, connection, dtype):
read_sql_query(self.query_col, self.con)
class ReadSQLTable:
-
def setup(self):
N = 10000
- self.table_name = 'test'
- self.con = create_engine('sqlite:///:memory:')
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.table_name = "test"
+ self.con = create_engine("sqlite:///:memory:")
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_read_sql_table_all(self):
read_sql_table(self.table_name, self.con)
def time_read_sql_table_parse_dates(self):
- read_sql_table(self.table_name, self.con, columns=['datetime_string'],
- parse_dates=['datetime_string'])
+ read_sql_table(
+ self.table_name,
+ self.con,
+ columns=["datetime_string"],
+ parse_dates=["datetime_string"],
+ )
class ReadSQLTableDtypes:
- params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime']
- param_names = ['dtype']
+ params = ["float", "float_with_nan", "string", "bool", "int", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
N = 10000
- self.table_name = 'test'
- self.con = create_engine('sqlite:///:memory:')
- self.df = DataFrame({'float': np.random.randn(N),
- 'float_with_nan': np.random.randn(N),
- 'string': ['foo'] * N,
- 'bool': [True] * N,
- 'int': np.random.randint(0, N, size=N),
- 'datetime': date_range('2000-01-01',
- periods=N,
- freq='s')},
- index=tm.makeStringIndex(N))
- self.df.loc[1000:3000, 'float_with_nan'] = np.nan
- self.df['datetime_string'] = self.df['datetime'].astype(str)
- self.df.to_sql(self.table_name, self.con, if_exists='replace')
+ self.table_name = "test"
+ self.con = create_engine("sqlite:///:memory:")
+ self.df = DataFrame(
+ {
+ "float": np.random.randn(N),
+ "float_with_nan": np.random.randn(N),
+ "string": ["foo"] * N,
+ "bool": [True] * N,
+ "int": np.random.randint(0, N, size=N),
+ "datetime": date_range("2000-01-01", periods=N, freq="s"),
+ },
+ index=tm.makeStringIndex(N),
+ )
+ self.df.loc[1000:3000, "float_with_nan"] = np.nan
+ self.df["datetime_string"] = self.df["datetime"].astype(str)
+ self.df.to_sql(self.table_name, self.con, if_exists="replace")
def time_read_sql_table_column(self, dtype):
read_sql_table(self.table_name, self.con, columns=[dtype])
diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py
index fff10cf10a4d3..b3ed71af47dc8 100644
--- a/asv_bench/benchmarks/io/stata.py
+++ b/asv_bench/benchmarks/io/stata.py
@@ -7,26 +7,30 @@
class Stata(BaseIO):
- params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty']
- param_names = ['convert_dates']
+ params = ["tc", "td", "tm", "tw", "th", "tq", "ty"]
+ param_names = ["convert_dates"]
def setup(self, convert_dates):
- self.fname = '__test__.dta'
+ self.fname = "__test__.dta"
N = self.N = 100000
C = self.C = 5
- self.df = DataFrame(np.random.randn(N, C),
- columns=['float{}'.format(i) for i in range(C)],
- index=date_range('20000101', periods=N, freq='H'))
- self.df['object'] = tm.makeStringIndex(self.N)
- self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min,
- np.iinfo(np.int8).max - 27, N)
- self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min,
- np.iinfo(np.int16).max - 27, N)
- self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min,
- np.iinfo(np.int32).max - 27, N)
- self.df['float32_'] = np.array(np.random.randn(N),
- dtype=np.float32)
- self.convert_dates = {'index': convert_dates}
+ self.df = DataFrame(
+ np.random.randn(N, C),
+ columns=["float{}".format(i) for i in range(C)],
+ index=date_range("20000101", periods=N, freq="H"),
+ )
+ self.df["object"] = tm.makeStringIndex(self.N)
+ self.df["int8_"] = np.random.randint(
+ np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N
+ )
+ self.df["int16_"] = np.random.randint(
+ np.iinfo(np.int16).min, np.iinfo(np.int16).max - 27, N
+ )
+ self.df["int32_"] = np.random.randint(
+ np.iinfo(np.int32).min, np.iinfo(np.int32).max - 27, N
+ )
+ self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32)
+ self.convert_dates = {"index": convert_dates}
self.df.to_stata(self.fname, self.convert_dates)
def time_read_stata(self, convert_dates):
@@ -42,7 +46,7 @@ def setup(self, convert_dates):
for i in range(10):
missing_data = np.random.randn(self.N)
missing_data[missing_data < 0] = np.nan
- self.df['missing_{0}'.format(i)] = missing_data
+ self.df["missing_{0}".format(i)] = missing_data
self.df.to_stata(self.fname, self.convert_dates)
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
index bbaba9909966e..7c899e3dc6ac8 100644
--- a/asv_bench/benchmarks/join_merge.py
+++ b/asv_bench/benchmarks/join_merge.py
@@ -2,8 +2,7 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex,
- date_range, concat, merge, merge_asof)
+from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof
try:
from pandas import merge_ordered
@@ -12,16 +11,14 @@
class Append:
-
def setup(self):
- self.df1 = DataFrame(np.random.randn(10000, 4),
- columns=['A', 'B', 'C', 'D'])
+ self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"])
self.df2 = self.df1.copy()
self.df2.index = np.arange(10000, 20000)
self.mdf1 = self.df1.copy()
- self.mdf1['obj1'] = 'bar'
- self.mdf1['obj2'] = 'bar'
- self.mdf1['int1'] = 5
+ self.mdf1["obj1"] = "bar"
+ self.mdf1["obj2"] = "bar"
+ self.mdf1["int1"] = 5
self.mdf1 = self.mdf1._consolidate()
self.mdf2 = self.mdf1.copy()
self.mdf2.index = self.df2.index
@@ -36,15 +33,16 @@ def time_append_mixed(self):
class Concat:
params = [0, 1]
- param_names = ['axis']
+ param_names = ["axis"]
def setup(self, axis):
N = 1000
s = Series(N, index=tm.makeStringIndex(N))
- self.series = [s[i:- i] for i in range(1, 10)] * 50
+ self.series = [s[i:-i] for i in range(1, 10)] * 50
self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000
- df = DataFrame({'A': range(N)},
- index=date_range('20130101', periods=N, freq='s'))
+ df = DataFrame(
+ {"A": range(N)}, index=date_range("20130101", periods=N, freq="s")
+ )
self.empty_left = [DataFrame(), df]
self.empty_right = [df, DataFrame()]
self.mixed_ndims = [df, df.head(N // 2)]
@@ -68,14 +66,12 @@ def time_concat_mixed_ndims(self, axis):
class ConcatDataFrames:
params = ([0, 1], [True, False])
- param_names = ['axis', 'ignore_index']
+ param_names = ["axis", "ignore_index"]
def setup(self, axis, ignore_index):
- frame_c = DataFrame(np.zeros((10000, 200),
- dtype=np.float32, order='C'))
+ frame_c = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="C"))
self.frame_c = [frame_c] * 20
- frame_f = DataFrame(np.zeros((10000, 200),
- dtype=np.float32, order='F'))
+ frame_f = DataFrame(np.zeros((10000, 200), dtype=np.float32, order="F"))
self.frame_f = [frame_f] * 20
def time_c_ordered(self, axis, ignore_index):
@@ -88,74 +84,78 @@ def time_f_ordered(self, axis, ignore_index):
class Join:
params = [True, False]
- param_names = ['sort']
+ param_names = ["sort"]
def setup(self, sort):
level1 = tm.makeStringIndex(10).values
level2 = tm.makeStringIndex(1000).values
codes1 = np.arange(10).repeat(1000)
codes2 = np.tile(np.arange(1000), 10)
- index2 = MultiIndex(levels=[level1, level2],
- codes=[codes1, codes2])
- self.df_multi = DataFrame(np.random.randn(len(index2), 4),
- index=index2,
- columns=['A', 'B', 'C', 'D'])
+ index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
+ self.df_multi = DataFrame(
+ np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
+ )
self.key1 = np.tile(level1.take(codes1), 10)
self.key2 = np.tile(level2.take(codes2), 10)
- self.df = DataFrame({'data1': np.random.randn(100000),
- 'data2': np.random.randn(100000),
- 'key1': self.key1,
- 'key2': self.key2})
-
- self.df_key1 = DataFrame(np.random.randn(len(level1), 4),
- index=level1,
- columns=['A', 'B', 'C', 'D'])
- self.df_key2 = DataFrame(np.random.randn(len(level2), 4),
- index=level2,
- columns=['A', 'B', 'C', 'D'])
+ self.df = DataFrame(
+ {
+ "data1": np.random.randn(100000),
+ "data2": np.random.randn(100000),
+ "key1": self.key1,
+ "key2": self.key2,
+ }
+ )
+
+ self.df_key1 = DataFrame(
+ np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
+ )
+ self.df_key2 = DataFrame(
+ np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
+ )
shuf = np.arange(100000)
np.random.shuffle(shuf)
self.df_shuf = self.df.reindex(self.df.index[shuf])
def time_join_dataframe_index_multi(self, sort):
- self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort)
+ self.df.join(self.df_multi, on=["key1", "key2"], sort=sort)
def time_join_dataframe_index_single_key_bigger(self, sort):
- self.df.join(self.df_key2, on='key2', sort=sort)
+ self.df.join(self.df_key2, on="key2", sort=sort)
def time_join_dataframe_index_single_key_small(self, sort):
- self.df.join(self.df_key1, on='key1', sort=sort)
+ self.df.join(self.df_key1, on="key1", sort=sort)
def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
- self.df_shuf.join(self.df_key2, on='key2', sort=sort)
+ self.df_shuf.join(self.df_key2, on="key2", sort=sort)
class JoinIndex:
-
def setup(self):
N = 50000
- self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)),
- columns=['jim', 'joe'])
- self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)),
- columns=['jolie', 'jolia']).set_index('jolie')
+ self.left = DataFrame(
+ np.random.randint(1, N / 500, (N, 2)), columns=["jim", "joe"]
+ )
+ self.right = DataFrame(
+ np.random.randint(1, N / 500, (N, 2)), columns=["jolie", "jolia"]
+ ).set_index("jolie")
def time_left_outer_join_index(self):
- self.left.join(self.right, on='jim')
+ self.left.join(self.right, on="jim")
class JoinNonUnique:
# outer join of non-unique
# GH 6329
def setup(self):
- date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T')
- daily_dates = date_index.to_period('D').to_timestamp('S', 'S')
+ date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T")
+ daily_dates = date_index.to_period("D").to_timestamp("S", "S")
self.fracofday = date_index.values - daily_dates.values
- self.fracofday = self.fracofday.astype('timedelta64[ns]')
+ self.fracofday = self.fracofday.astype("timedelta64[ns]")
self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0
self.fracofday = Series(self.fracofday, daily_dates)
- index = date_range(date_index.min(), date_index.max(), freq='D')
+ index = date_range(date_index.min(), date_index.max(), freq="D")
self.temp = Series(1.0, index)[self.fracofday.index]
def time_join_non_unique_equal(self):
@@ -165,7 +165,7 @@ def time_join_non_unique_equal(self):
class Merge:
params = [True, False]
- param_names = ['sort']
+ param_names = ["sort"]
def setup(self, sort):
N = 10000
@@ -173,17 +173,25 @@ def setup(self, sort):
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
- self.left = DataFrame({'key': key, 'key2': key2,
- 'value': np.random.randn(80000)})
- self.right = DataFrame({'key': indices[2000:],
- 'key2': indices2[2000:],
- 'value2': np.random.randn(8000)})
-
- self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2),
- 'key2': np.tile(np.arange(250).repeat(10), 4),
- 'value': np.random.randn(10000)})
- self.df2 = DataFrame({'key1': np.arange(500),
- 'value2': np.random.randn(500)})
+ self.left = DataFrame(
+ {"key": key, "key2": key2, "value": np.random.randn(80000)}
+ )
+ self.right = DataFrame(
+ {
+ "key": indices[2000:],
+ "key2": indices2[2000:],
+ "value2": np.random.randn(8000),
+ }
+ )
+
+ self.df = DataFrame(
+ {
+ "key1": np.tile(np.arange(500).repeat(10), 2),
+ "key2": np.tile(np.arange(250).repeat(10), 4),
+ "value": np.random.randn(10000),
+ }
+ )
+ self.df2 = DataFrame({"key1": np.arange(500), "value2": np.random.randn(500)})
self.df3 = self.df[:5000]
def time_merge_2intkey(self, sort):
@@ -193,125 +201,141 @@ def time_merge_dataframe_integer_2key(self, sort):
merge(self.df, self.df3, sort=sort)
def time_merge_dataframe_integer_key(self, sort):
- merge(self.df, self.df2, on='key1', sort=sort)
+ merge(self.df, self.df2, on="key1", sort=sort)
class I8Merge:
- params = ['inner', 'outer', 'left', 'right']
- param_names = ['how']
+ params = ["inner", "outer", "left", "right"]
+ param_names = ["how"]
def setup(self, how):
- low, high, n = -1000, 1000, 10**6
- self.left = DataFrame(np.random.randint(low, high, (n, 7)),
- columns=list('ABCDEFG'))
- self.left['left'] = self.left.sum(axis=1)
- self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1)
+ low, high, n = -1000, 1000, 10 ** 6
+ self.left = DataFrame(
+ np.random.randint(low, high, (n, 7)), columns=list("ABCDEFG")
+ )
+ self.left["left"] = self.left.sum(axis=1)
+ self.right = self.left.sample(frac=1).rename({"left": "right"}, axis=1)
self.right = self.right.reset_index(drop=True)
- self.right['right'] *= -1
+ self.right["right"] *= -1
def time_i8merge(self, how):
merge(self.left, self.right, how=how)
class MergeCategoricals:
-
def setup(self):
self.left_object = DataFrame(
- {'X': np.random.choice(range(0, 10), size=(10000,)),
- 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
+ {
+ "X": np.random.choice(range(0, 10), size=(10000,)),
+ "Y": np.random.choice(["one", "two", "three"], size=(10000,)),
+ }
+ )
self.right_object = DataFrame(
- {'X': np.random.choice(range(0, 10), size=(10000,)),
- 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
+ {
+ "X": np.random.choice(range(0, 10), size=(10000,)),
+ "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)),
+ }
+ )
self.left_cat = self.left_object.assign(
- Y=self.left_object['Y'].astype('category'))
+ Y=self.left_object["Y"].astype("category")
+ )
self.right_cat = self.right_object.assign(
- Z=self.right_object['Z'].astype('category'))
+ Z=self.right_object["Z"].astype("category")
+ )
def time_merge_object(self):
- merge(self.left_object, self.right_object, on='X')
+ merge(self.left_object, self.right_object, on="X")
def time_merge_cat(self):
- merge(self.left_cat, self.right_cat, on='X')
+ merge(self.left_cat, self.right_cat, on="X")
class MergeOrdered:
-
def setup(self):
groups = tm.makeStringIndex(10).values
- self.left = DataFrame({'group': groups.repeat(5000),
- 'key': np.tile(np.arange(0, 10000, 2), 10),
- 'lvalue': np.random.randn(50000)})
- self.right = DataFrame({'key': np.arange(10000),
- 'rvalue': np.random.randn(10000)})
+ self.left = DataFrame(
+ {
+ "group": groups.repeat(5000),
+ "key": np.tile(np.arange(0, 10000, 2), 10),
+ "lvalue": np.random.randn(50000),
+ }
+ )
+ self.right = DataFrame(
+ {"key": np.arange(10000), "rvalue": np.random.randn(10000)}
+ )
def time_merge_ordered(self):
- merge_ordered(self.left, self.right, on='key', left_by='group')
+ merge_ordered(self.left, self.right, on="key", left_by="group")
class MergeAsof:
- params = [['backward', 'forward', 'nearest']]
- param_names = ['direction']
+ params = [["backward", "forward", "nearest"]]
+ param_names = ["direction"]
def setup(self, direction):
one_count = 200000
two_count = 1000000
df1 = DataFrame(
- {'time': np.random.randint(0, one_count / 20, one_count),
- 'key': np.random.choice(list(string.ascii_uppercase), one_count),
- 'key2': np.random.randint(0, 25, one_count),
- 'value1': np.random.randn(one_count)})
+ {
+ "time": np.random.randint(0, one_count / 20, one_count),
+ "key": np.random.choice(list(string.ascii_uppercase), one_count),
+ "key2": np.random.randint(0, 25, one_count),
+ "value1": np.random.randn(one_count),
+ }
+ )
df2 = DataFrame(
- {'time': np.random.randint(0, two_count / 20, two_count),
- 'key': np.random.choice(list(string.ascii_uppercase), two_count),
- 'key2': np.random.randint(0, 25, two_count),
- 'value2': np.random.randn(two_count)})
-
- df1 = df1.sort_values('time')
- df2 = df2.sort_values('time')
-
- df1['time32'] = np.int32(df1.time)
- df2['time32'] = np.int32(df2.time)
-
- self.df1a = df1[['time', 'value1']]
- self.df2a = df2[['time', 'value2']]
- self.df1b = df1[['time', 'key', 'value1']]
- self.df2b = df2[['time', 'key', 'value2']]
- self.df1c = df1[['time', 'key2', 'value1']]
- self.df2c = df2[['time', 'key2', 'value2']]
- self.df1d = df1[['time32', 'value1']]
- self.df2d = df2[['time32', 'value2']]
- self.df1e = df1[['time', 'key', 'key2', 'value1']]
- self.df2e = df2[['time', 'key', 'key2', 'value2']]
+ {
+ "time": np.random.randint(0, two_count / 20, two_count),
+ "key": np.random.choice(list(string.ascii_uppercase), two_count),
+ "key2": np.random.randint(0, 25, two_count),
+ "value2": np.random.randn(two_count),
+ }
+ )
+
+ df1 = df1.sort_values("time")
+ df2 = df2.sort_values("time")
+
+ df1["time32"] = np.int32(df1.time)
+ df2["time32"] = np.int32(df2.time)
+
+ self.df1a = df1[["time", "value1"]]
+ self.df2a = df2[["time", "value2"]]
+ self.df1b = df1[["time", "key", "value1"]]
+ self.df2b = df2[["time", "key", "value2"]]
+ self.df1c = df1[["time", "key2", "value1"]]
+ self.df2c = df2[["time", "key2", "value2"]]
+ self.df1d = df1[["time32", "value1"]]
+ self.df2d = df2[["time32", "value2"]]
+ self.df1e = df1[["time", "key", "key2", "value1"]]
+ self.df2e = df2[["time", "key", "key2", "value2"]]
def time_on_int(self, direction):
- merge_asof(self.df1a, self.df2a, on='time', direction=direction)
+ merge_asof(self.df1a, self.df2a, on="time", direction=direction)
def time_on_int32(self, direction):
- merge_asof(self.df1d, self.df2d, on='time32', direction=direction)
+ merge_asof(self.df1d, self.df2d, on="time32", direction=direction)
def time_by_object(self, direction):
- merge_asof(self.df1b, self.df2b, on='time', by='key',
- direction=direction)
+ merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction)
def time_by_int(self, direction):
- merge_asof(self.df1c, self.df2c, on='time', by='key2',
- direction=direction)
+ merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction)
def time_multiby(self, direction):
- merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'],
- direction=direction)
+ merge_asof(
+ self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction
+ )
class Align:
-
def setup(self):
- size = 5 * 10**5
- rng = np.arange(0, 10**13, 10**7)
- stamps = np.datetime64('now').view('i8') + rng
+ size = 5 * 10 ** 5
+ rng = np.arange(0, 10 ** 13, 10 ** 7)
+ stamps = np.datetime64("now").view("i8") + rng
idx1 = np.sort(np.random.choice(stamps, size, replace=False))
idx2 = np.sort(np.random.choice(stamps, size, replace=False))
self.ts1 = Series(np.random.randn(size), idx1)
@@ -321,7 +345,7 @@ def time_series_align_int64_index(self):
self.ts1 + self.ts2
def time_series_align_left_monotonic(self):
- self.ts1.align(self.ts2, join='left')
+ self.ts1.align(self.ts2, join="left")
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
index c979ba6d53a08..eda059a68e8a5 100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -6,46 +6,44 @@
class GetLoc:
-
def setup(self):
self.mi_large = MultiIndex.from_product(
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
- names=['one', 'two', 'three'])
+ names=["one", "two", "three"],
+ )
self.mi_med = MultiIndex.from_product(
- [np.arange(1000), np.arange(10), list('A')],
- names=['one', 'two', 'three'])
+ [np.arange(1000), np.arange(10), list("A")], names=["one", "two", "three"]
+ )
self.mi_small = MultiIndex.from_product(
- [np.arange(100), list('A'), list('A')],
- names=['one', 'two', 'three'])
+ [np.arange(100), list("A"), list("A")], names=["one", "two", "three"]
+ )
def time_large_get_loc(self):
- self.mi_large.get_loc((999, 19, 'Z'))
+ self.mi_large.get_loc((999, 19, "Z"))
def time_large_get_loc_warm(self):
for _ in range(1000):
- self.mi_large.get_loc((999, 19, 'Z'))
+ self.mi_large.get_loc((999, 19, "Z"))
def time_med_get_loc(self):
- self.mi_med.get_loc((999, 9, 'A'))
+ self.mi_med.get_loc((999, 9, "A"))
def time_med_get_loc_warm(self):
for _ in range(1000):
- self.mi_med.get_loc((999, 9, 'A'))
+ self.mi_med.get_loc((999, 9, "A"))
def time_string_get_loc(self):
- self.mi_small.get_loc((99, 'A', 'A'))
+ self.mi_small.get_loc((99, "A", "A"))
def time_small_get_loc_warm(self):
for _ in range(1000):
- self.mi_small.get_loc((99, 'A', 'A'))
+ self.mi_small.get_loc((99, "A", "A"))
class Duplicates:
-
def setup(self):
size = 65536
- arrays = [np.random.randint(0, 8192, size),
- np.random.randint(0, 1024, size)]
+ arrays = [np.random.randint(0, 8192, size), np.random.randint(0, 1024, size)]
mask = np.random.rand(size) < 0.1
self.mi_unused_levels = MultiIndex.from_arrays(arrays)
self.mi_unused_levels = self.mi_unused_levels[mask]
@@ -55,15 +53,25 @@ def time_remove_unused_levels(self):
class Integer:
-
def setup(self):
- self.mi_int = MultiIndex.from_product([np.arange(1000),
- np.arange(1000)],
- names=['one', 'two'])
- self.obj_index = np.array([(0, 10), (0, 11), (0, 12),
- (0, 13), (0, 14), (0, 15),
- (0, 16), (0, 17), (0, 18),
- (0, 19)], dtype=object)
+ self.mi_int = MultiIndex.from_product(
+ [np.arange(1000), np.arange(1000)], names=["one", "two"]
+ )
+ self.obj_index = np.array(
+ [
+ (0, 10),
+ (0, 11),
+ (0, 12),
+ (0, 13),
+ (0, 14),
+ (0, 15),
+ (0, 16),
+ (0, 17),
+ (0, 18),
+ (0, 19),
+ ],
+ dtype=object,
+ )
def time_get_indexer(self):
self.mi_int.get_indexer(self.obj_index)
@@ -73,12 +81,9 @@ def time_is_monotonic(self):
class Duplicated:
-
def setup(self):
n, k = 200, 5000
- levels = [np.arange(n),
- tm.makeStringIndex(n).values,
- 1000 + np.arange(n)]
+ levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
codes = [np.random.choice(n, (k * n)) for lev in levels]
self.mi = MultiIndex(levels=levels, codes=codes)
@@ -87,12 +92,13 @@ def time_duplicated(self):
class Sortlevel:
-
def setup(self):
n = 1182720
low, high = -4096, 4096
- arrs = [np.repeat(np.random.randint(low, high, (n // k)), k)
- for k in [11, 7, 5, 3, 1]]
+ arrs = [
+ np.repeat(np.random.randint(low, high, (n // k)), k)
+ for k in [11, 7, 5, 3, 1]
+ ]
self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)]
a = np.repeat(np.arange(100), 1000)
@@ -111,11 +117,10 @@ def time_sortlevel_one(self):
class Values:
-
def setup_cache(self):
level1 = range(1000)
- level2 = date_range(start='1/1/2012', periods=100)
+ level2 = date_range(start="1/1/2012", periods=100)
mi = MultiIndex.from_product([level1, level2])
return mi
@@ -127,17 +132,18 @@ def time_datetime_level_values_sliced(self, mi):
class CategoricalLevel:
-
def setup(self):
- self.df = DataFrame({
- 'a': np.arange(1_000_000, dtype=np.int32),
- 'b': np.arange(1_000_000, dtype=np.int64),
- 'c': np.arange(1_000_000, dtype=float),
- }).astype({'a': 'category', 'b': 'category'})
+ self.df = DataFrame(
+ {
+ "a": np.arange(1_000_000, dtype=np.int32),
+ "b": np.arange(1_000_000, dtype=np.int64),
+ "c": np.arange(1_000_000, dtype=float),
+ }
+ ).astype({"a": "category", "b": "category"})
def time_categorical_level(self):
- self.df.set_index(['a', 'b'])
+ self.df.set_index(["a", "b"])
from .pandas_vb_common import setup # noqa: F401
diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py
index 9b738e699a5b3..31c3b6fb6cb60 100644
--- a/asv_bench/benchmarks/offset.py
+++ b/asv_bench/benchmarks/offset.py
@@ -3,6 +3,7 @@
import numpy as np
import pandas as pd
+
try:
import pandas.tseries.holiday # noqa
except ImportError:
@@ -10,35 +11,43 @@
hcal = pd.tseries.holiday.USFederalHolidayCalendar()
# These offsets currently raise a NotImplementedError with .apply_index()
-non_apply = [pd.offsets.Day(),
- pd.offsets.BYearEnd(),
- pd.offsets.BYearBegin(),
- pd.offsets.BQuarterEnd(),
- pd.offsets.BQuarterBegin(),
- pd.offsets.BMonthEnd(),
- pd.offsets.BMonthBegin(),
- pd.offsets.CustomBusinessDay(),
- pd.offsets.CustomBusinessDay(calendar=hcal),
- pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
- pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
- pd.offsets.CustomBusinessMonthEnd(calendar=hcal)]
-other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(),
- pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(),
- pd.offsets.MonthEnd(), pd.offsets.MonthBegin(),
- pd.offsets.DateOffset(months=2, days=2),
- pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(),
- pd.offsets.SemiMonthBegin()]
+non_apply = [
+ pd.offsets.Day(),
+ pd.offsets.BYearEnd(),
+ pd.offsets.BYearBegin(),
+ pd.offsets.BQuarterEnd(),
+ pd.offsets.BQuarterBegin(),
+ pd.offsets.BMonthEnd(),
+ pd.offsets.BMonthBegin(),
+ pd.offsets.CustomBusinessDay(),
+ pd.offsets.CustomBusinessDay(calendar=hcal),
+ pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
+ pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
+ pd.offsets.CustomBusinessMonthEnd(calendar=hcal),
+]
+other_offsets = [
+ pd.offsets.YearEnd(),
+ pd.offsets.YearBegin(),
+ pd.offsets.QuarterEnd(),
+ pd.offsets.QuarterBegin(),
+ pd.offsets.MonthEnd(),
+ pd.offsets.MonthBegin(),
+ pd.offsets.DateOffset(months=2, days=2),
+ pd.offsets.BusinessDay(),
+ pd.offsets.SemiMonthEnd(),
+ pd.offsets.SemiMonthBegin(),
+]
offsets = non_apply + other_offsets
class ApplyIndex:
params = other_offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 10000
- self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ self.rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
def time_apply_index(self, offset):
offset.apply_index(self.rng)
@@ -47,13 +56,15 @@ def time_apply_index(self, offset):
class OnOffset:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
- self.dates = [datetime(2016, m, d)
- for m in [10, 11, 12]
- for d in [1, 2, 3, 28, 29, 30, 31]
- if not (m == 11 and d == 31)]
+ self.dates = [
+ datetime(2016, m, d)
+ for m in [10, 11, 12]
+ for d in [1, 2, 3, 28, 29, 30, 31]
+ if not (m == 11 and d == 31)
+ ]
def time_on_offset(self, offset):
for date in self.dates:
@@ -63,11 +74,11 @@ def time_on_offset(self, offset):
class OffsetSeriesArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 1000
- rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ rng = pd.date_range(start="1/1/2000", periods=N, freq="T")
self.data = pd.Series(rng)
def time_add_offset(self, offset):
@@ -78,11 +89,11 @@ def time_add_offset(self, offset):
class OffsetDatetimeIndexArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
N = 1000
- self.data = pd.date_range(start='1/1/2000', periods=N, freq='T')
+ self.data = pd.date_range(start="1/1/2000", periods=N, freq="T")
def time_add_offset(self, offset):
with warnings.catch_warnings(record=True):
@@ -92,11 +103,11 @@ def time_add_offset(self, offset):
class OffestDatetimeArithmetic:
params = offsets
- param_names = ['offset']
+ param_names = ["offset"]
def setup(self, offset):
self.date = datetime(2011, 1, 1)
- self.dt64 = np.datetime64('2011-01-01 09:00Z')
+ self.dt64 = np.datetime64("2011-01-01 09:00Z")
def time_apply(self, offset):
offset.apply(self.date)
diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py
index 59b1638920666..fdc8207021c0f 100644
--- a/asv_bench/benchmarks/pandas_vb_common.py
+++ b/asv_bench/benchmarks/pandas_vb_common.py
@@ -5,26 +5,42 @@
import pandas as pd
# Compatibility import for lib
-for imp in ['pandas._libs.lib', 'pandas.lib']:
+for imp in ["pandas._libs.lib", "pandas.lib"]:
try:
lib = import_module(imp)
break
except (ImportError, TypeError, ValueError):
pass
-numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
- np.float64, np.int16, np.int8, np.uint16, np.uint8]
+numeric_dtypes = [
+ np.int64,
+ np.int32,
+ np.uint32,
+ np.uint64,
+ np.float32,
+ np.float64,
+ np.int16,
+ np.int8,
+ np.uint16,
+ np.uint8,
+]
datetime_dtypes = [np.datetime64, np.timedelta64]
string_dtypes = [np.object]
try:
- extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype,
- pd.Int32Dtype, pd.Int64Dtype,
- pd.UInt8Dtype, pd.UInt16Dtype,
- pd.UInt32Dtype, pd.UInt64Dtype,
- pd.CategoricalDtype,
- pd.IntervalDtype,
- pd.DatetimeTZDtype('ns', 'UTC'),
- pd.PeriodDtype('D')]
+ extension_dtypes = [
+ pd.Int8Dtype,
+ pd.Int16Dtype,
+ pd.Int32Dtype,
+ pd.Int64Dtype,
+ pd.UInt8Dtype,
+ pd.UInt16Dtype,
+ pd.UInt32Dtype,
+ pd.UInt64Dtype,
+ pd.CategoricalDtype,
+ pd.IntervalDtype,
+ pd.DatetimeTZDtype("ns", "UTC"),
+ pd.PeriodDtype("D"),
+ ]
except AttributeError:
extension_dtypes = []
@@ -40,6 +56,7 @@ class BaseIO:
"""
Base class for IO benchmarks
"""
+
fname = None
def remove(self, f):
diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py
index c8ba6c382cb64..2f8ae0650ab75 100644
--- a/asv_bench/benchmarks/period.py
+++ b/asv_bench/benchmarks/period.py
@@ -1,18 +1,33 @@
-from pandas import (
- DataFrame, Period, PeriodIndex, Series, date_range, period_range)
+from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range
from pandas.tseries.frequencies import to_offset
class PeriodProperties:
- params = (['M', 'min'],
- ['year', 'month', 'day', 'hour', 'minute', 'second',
- 'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth',
- 'dayofweek', 'dayofyear', 'start_time', 'end_time'])
- param_names = ['freq', 'attr']
+ params = (
+ ["M", "min"],
+ [
+ "year",
+ "month",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "is_leap_year",
+ "quarter",
+ "qyear",
+ "week",
+ "daysinmonth",
+ "dayofweek",
+ "dayofyear",
+ "start_time",
+ "end_time",
+ ],
+ )
+ param_names = ["freq", "attr"]
def setup(self, freq, attr):
- self.per = Period('2012-06-01', freq=freq)
+ self.per = Period("2012-06-01", freq=freq)
def time_property(self, freq, attr):
getattr(self.per, attr)
@@ -20,11 +35,11 @@ def time_property(self, freq, attr):
class PeriodUnaryMethods:
- params = ['M', 'min']
- param_names = ['freq']
+ params = ["M", "min"]
+ param_names = ["freq"]
def setup(self, freq):
- self.per = Period('2012-06-01', freq=freq)
+ self.per = Period("2012-06-01", freq=freq)
def time_to_timestamp(self, freq):
self.per.to_timestamp()
@@ -33,12 +48,12 @@ def time_now(self, freq):
self.per.now(freq)
def time_asfreq(self, freq):
- self.per.asfreq('A')
+ self.per.asfreq("A")
class PeriodConstructor:
- params = [['D'], [True, False]]
- param_names = ['freq', 'is_offset']
+ params = [["D"], [True, False]]
+ param_names = ["freq", "is_offset"]
def setup(self, freq, is_offset):
if is_offset:
@@ -47,20 +62,21 @@ def setup(self, freq, is_offset):
self.freq = freq
def time_period_constructor(self, freq, is_offset):
- Period('2012-06-01', freq=freq)
+ Period("2012-06-01", freq=freq)
class PeriodIndexConstructor:
- params = [['D'], [True, False]]
- param_names = ['freq', 'is_offset']
+ params = [["D"], [True, False]]
+ param_names = ["freq", "is_offset"]
def setup(self, freq, is_offset):
- self.rng = date_range('1985', periods=1000)
- self.rng2 = date_range('1985', periods=1000).to_pydatetime()
+ self.rng = date_range("1985", periods=1000)
+ self.rng2 = date_range("1985", periods=1000).to_pydatetime()
self.ints = list(range(2000, 3000))
- self.daily_ints = date_range('1/1/2000', periods=1000,
- freq=freq).strftime('%Y%m%d').map(int)
+ self.daily_ints = (
+ date_range("1/1/2000", periods=1000, freq=freq).strftime("%Y%m%d").map(int)
+ )
if is_offset:
self.freq = to_offset(freq)
else:
@@ -80,32 +96,35 @@ def time_from_ints_daily(self, freq, is_offset):
class DataFramePeriodColumn:
-
def setup(self):
- self.rng = period_range(start='1/1/1990', freq='S', periods=20000)
+ self.rng = period_range(start="1/1/1990", freq="S", periods=20000)
self.df = DataFrame(index=range(len(self.rng)))
def time_setitem_period_column(self):
- self.df['col'] = self.rng
+ self.df["col"] = self.rng
def time_set_index(self):
# GH#21582 limited by comparisons of Period objects
- self.df['col2'] = self.rng
- self.df.set_index('col2', append=True)
+ self.df["col2"] = self.rng
+ self.df.set_index("col2", append=True)
class Algorithms:
- params = ['index', 'series']
- param_names = ['typ']
+ params = ["index", "series"]
+ param_names = ["typ"]
def setup(self, typ):
- data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
- Period('2011-03', freq='M'), Period('2011-04', freq='M')]
-
- if typ == 'index':
- self.vector = PeriodIndex(data * 1000, freq='M')
- elif typ == 'series':
+ data = [
+ Period("2011-01", freq="M"),
+ Period("2011-02", freq="M"),
+ Period("2011-03", freq="M"),
+ Period("2011-04", freq="M"),
+ ]
+
+ if typ == "index":
+ self.vector = PeriodIndex(data * 1000, freq="M")
+ elif typ == "series":
self.vector = Series(data * 1000)
def time_drop_duplicates(self, typ):
@@ -116,9 +135,8 @@ def time_value_counts(self, typ):
class Indexing:
-
def setup(self):
- self.index = period_range(start='1985', periods=1000, freq='D')
+ self.index = period_range(start="1985", periods=1000, freq="D")
self.series = Series(range(1000), index=self.index)
self.period = self.index[500]
@@ -135,7 +153,7 @@ def time_series_loc(self):
self.series.loc[self.period]
def time_align(self):
- DataFrame({'a': self.series, 'b': self.series[:500]})
+ DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index[:750].intersection(self.index[250:])
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
index 9e3bc87c32987..4fb0876f05a0a 100644
--- a/asv_bench/benchmarks/plotting.py
+++ b/asv_bench/benchmarks/plotting.py
@@ -1,27 +1,29 @@
import numpy as np
from pandas import DataFrame, Series, DatetimeIndex, date_range
+
try:
from pandas.plotting import andrews_curves
except ImportError:
from pandas.tools.plotting import andrews_curves
import matplotlib
-matplotlib.use('Agg')
+
+matplotlib.use("Agg")
class SeriesPlotting:
- params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']]
- param_names = ['kind']
+ params = [["line", "bar", "area", "barh", "hist", "kde", "pie"]]
+ param_names = ["kind"]
def setup(self, kind):
- if kind in ['bar', 'barh', 'pie']:
+ if kind in ["bar", "barh", "pie"]:
n = 100
- elif kind in ['kde']:
+ elif kind in ["kde"]:
n = 10000
else:
n = 1000000
self.s = Series(np.random.randn(n))
- if kind in ['area', 'pie']:
+ if kind in ["area", "pie"]:
self.s = self.s.abs()
def time_series_plot(self, kind):
@@ -29,41 +31,43 @@ def time_series_plot(self, kind):
class FramePlotting:
- params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter',
- 'hexbin']]
- param_names = ['kind']
+ params = [
+ ["line", "bar", "area", "barh", "hist", "kde", "pie", "scatter", "hexbin"]
+ ]
+ param_names = ["kind"]
def setup(self, kind):
- if kind in ['bar', 'barh', 'pie']:
+ if kind in ["bar", "barh", "pie"]:
n = 100
- elif kind in ['kde', 'scatter', 'hexbin']:
+ elif kind in ["kde", "scatter", "hexbin"]:
n = 10000
else:
n = 1000000
self.x = Series(np.random.randn(n))
self.y = Series(np.random.randn(n))
- if kind in ['area', 'pie']:
+ if kind in ["area", "pie"]:
self.x = self.x.abs()
self.y = self.y.abs()
- self.df = DataFrame({'x': self.x, 'y': self.y})
+ self.df = DataFrame({"x": self.x, "y": self.y})
def time_frame_plot(self, kind):
- self.df.plot(x='x', y='y', kind=kind)
+ self.df.plot(x="x", y="y", kind=kind)
class TimeseriesPlotting:
-
def setup(self):
N = 2000
M = 5
- idx = date_range('1/1/1975', periods=N)
+ idx = date_range("1/1/1975", periods=N)
self.df = DataFrame(np.random.randn(N, M), index=idx)
- idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10],
- idx.values[12:])))
- self.df2 = DataFrame(np.random.randn(len(idx_irregular), M),
- index=idx_irregular)
+ idx_irregular = DatetimeIndex(
+ np.concatenate((idx.values[0:10], idx.values[12:]))
+ )
+ self.df2 = DataFrame(
+ np.random.randn(len(idx_irregular), M), index=idx_irregular
+ )
def time_plot_regular(self):
self.df.plot()
@@ -79,12 +83,11 @@ def time_plot_table(self):
class Misc:
-
def setup(self):
N = 500
M = 10
self.df = DataFrame(np.random.randn(N, M))
- self.df['Name'] = ["A"] * N
+ self.df["Name"] = ["A"] * N
def time_plot_andrews_curves(self):
andrews_curves(self.df, "Name")
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
index a6ceb0e93a089..8d4c9ebaf3e89 100644
--- a/asv_bench/benchmarks/reindex.py
+++ b/asv_bench/benchmarks/reindex.py
@@ -1,20 +1,18 @@
import numpy as np
import pandas.util.testing as tm
-from pandas import (DataFrame, Series, MultiIndex, Index, date_range,
- period_range)
+from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range
from .pandas_vb_common import lib
class Reindex:
-
def setup(self):
- rng = date_range(start='1/1/1970', periods=10000, freq='1min')
- self.df = DataFrame(np.random.rand(10000, 10), index=rng,
- columns=range(10))
- self.df['foo'] = 'bar'
+ rng = date_range(start="1/1/1970", periods=10000, freq="1min")
+ self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10))
+ self.df["foo"] = "bar"
self.rng_subset = Index(rng[::2])
- self.df2 = DataFrame(index=range(10000),
- data=np.random.rand(10000, 30), columns=range(30))
+ self.df2 = DataFrame(
+ index=range(10000), data=np.random.rand(10000, 30), columns=range(30)
+ )
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
@@ -35,12 +33,12 @@ def time_reindex_multiindex(self):
class ReindexMethod:
- params = [['pad', 'backfill'], [date_range, period_range]]
- param_names = ['method', 'constructor']
+ params = [["pad", "backfill"], [date_range, period_range]]
+ param_names = ["method", "constructor"]
def setup(self, method, constructor):
N = 100000
- self.idx = constructor('1/1/2000', periods=N, freq='1min')
+ self.idx = constructor("1/1/2000", periods=N, freq="1min")
self.ts = Series(np.random.randn(N), index=self.idx)[::2]
def time_reindex_method(self, method, constructor):
@@ -49,15 +47,15 @@ def time_reindex_method(self, method, constructor):
class Fillna:
- params = ['pad', 'backfill']
- param_names = ['method']
+ params = ["pad", "backfill"]
+ param_names = ["method"]
def setup(self, method):
N = 100000
- self.idx = date_range('1/1/2000', periods=N, freq='1min')
+ self.idx = date_range("1/1/2000", periods=N, freq="1min")
ts = Series(np.random.randn(N), index=self.idx)[::2]
self.ts_reindexed = ts.reindex(self.idx)
- self.ts_float32 = self.ts_reindexed.astype('float32')
+ self.ts_float32 = self.ts_reindexed.astype("float32")
def time_reindexed(self, method):
self.ts_reindexed.fillna(method=method)
@@ -67,17 +65,17 @@ def time_float_32(self, method):
class LevelAlign:
-
def setup(self):
self.index = MultiIndex(
levels=[np.arange(10), np.arange(100), np.arange(100)],
- codes=[np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)])
- self.df = DataFrame(np.random.randn(len(self.index), 4),
- index=self.index)
- self.df_level = DataFrame(np.random.randn(100, 4),
- index=self.index.levels[1])
+ codes=[
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ],
+ )
+ self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index)
+ self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1])
def time_align_level(self):
self.df.align(self.df_level, level=1, copy=False)
@@ -89,15 +87,16 @@ def time_reindex_level(self):
class DropDuplicates:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
- self.df = DataFrame({'key1': key1, 'key2': key2,
- 'value': np.random.randn(N * K)})
+ self.df = DataFrame(
+ {"key1": key1, "key2": key2, "value": np.random.randn(N * K)}
+ )
self.df_nan = self.df.copy()
self.df_nan.iloc[:10000, :] = np.nan
@@ -107,15 +106,14 @@ def setup(self, inplace):
N = 1000000
K = 10000
key1 = np.random.randint(0, K, size=N)
- self.df_int = DataFrame({'key1': key1})
- self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
- dtype=bool))
+ self.df_int = DataFrame({"key1": key1})
+ self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), dtype=bool))
def time_frame_drop_dups(self, inplace):
- self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)
+ self.df.drop_duplicates(["key1", "key2"], inplace=inplace)
def time_frame_drop_dups_na(self, inplace):
- self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)
+ self.df_nan.drop_duplicates(["key1", "key2"], inplace=inplace)
def time_series_drop_dups_int(self, inplace):
self.s.drop_duplicates(inplace=inplace)
@@ -137,16 +135,16 @@ def setup(self):
indices = tm.makeStringIndex(n)
subsample_size = 40000
self.x = Series(np.random.randn(n), indices)
- self.y = Series(np.random.randn(subsample_size),
- index=np.random.choice(indices, subsample_size,
- replace=False))
+ self.y = Series(
+ np.random.randn(subsample_size),
+ index=np.random.choice(indices, subsample_size, replace=False),
+ )
def time_align_series_irregular_string(self):
self.x + self.y
class LibFastZip:
-
def setup(self):
N = 10000
K = 10
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
index 9dff1778f8e56..6137e944e6b9e 100644
--- a/asv_bench/benchmarks/replace.py
+++ b/asv_bench/benchmarks/replace.py
@@ -5,11 +5,11 @@
class FillNa:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
- N = 10**6
- rng = pd.date_range('1/1/2000', periods=N, freq='min')
+ N = 10 ** 6
+ rng = pd.date_range("1/1/2000", periods=N, freq="min")
data = np.random.randn(N)
data[::2] = np.nan
self.ts = pd.Series(data, index=rng)
@@ -24,13 +24,13 @@ def time_replace(self, inplace):
class ReplaceDict:
params = [True, False]
- param_names = ['inplace']
+ param_names = ["inplace"]
def setup(self, inplace):
- N = 10**5
- start_value = 10**5
+ N = 10 ** 5
+ start_value = 10 ** 5
self.to_rep = dict(enumerate(np.arange(N) + start_value))
- self.s = pd.Series(np.random.randint(N, size=10**3))
+ self.s = pd.Series(np.random.randint(N, size=10 ** 3))
def time_replace_series(self, inplace):
self.s.replace(self.to_rep, inplace=inplace)
@@ -38,14 +38,17 @@ def time_replace_series(self, inplace):
class Convert:
- params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta'])
- param_names = ['constructor', 'replace_data']
+ params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"])
+ param_names = ["constructor", "replace_data"]
def setup(self, constructor, replace_data):
- N = 10**3
- data = {'Series': pd.Series(np.random.randint(N, size=N)),
- 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N),
- 'B': np.random.randint(N, size=N)})}
+ N = 10 ** 3
+ data = {
+ "Series": pd.Series(np.random.randint(N, size=N)),
+ "DataFrame": pd.DataFrame(
+ {"A": np.random.randint(N, size=N), "B": np.random.randint(N, size=N)}
+ ),
+ }
self.to_replace = {i: getattr(pd, replace_data) for i in range(N)}
self.data = data[constructor]
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 678403d837805..f41e13163b3f5 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -7,35 +7,33 @@
class Melt:
-
def setup(self):
- self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C'])
- self.df['id1'] = np.random.randint(0, 10, 10000)
- self.df['id2'] = np.random.randint(100, 1000, 10000)
+ self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"])
+ self.df["id1"] = np.random.randint(0, 10, 10000)
+ self.df["id2"] = np.random.randint(100, 1000, 10000)
def time_melt_dataframe(self):
- melt(self.df, id_vars=['id1', 'id2'])
+ melt(self.df, id_vars=["id1", "id2"])
class Pivot:
-
def setup(self):
N = 10000
- index = date_range('1/1/2000', periods=N, freq='h')
- data = {'value': np.random.randn(N * 50),
- 'variable': np.arange(50).repeat(N),
- 'date': np.tile(index.values, 50)}
+ index = date_range("1/1/2000", periods=N, freq="h")
+ data = {
+ "value": np.random.randn(N * 50),
+ "variable": np.arange(50).repeat(N),
+ "date": np.tile(index.values, 50),
+ }
self.df = DataFrame(data)
def time_reshape_pivot_time_series(self):
- self.df.pivot('date', 'variable', 'value')
+ self.df.pivot("date", "variable", "value")
class SimpleReshape:
-
def setup(self):
- arrays = [np.arange(100).repeat(100),
- np.roll(np.tile(np.arange(100), 100), 25)]
+ arrays = [np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]
index = MultiIndex.from_arrays(arrays)
self.df = DataFrame(np.random.randn(10000, 4), index=index)
self.udf = self.df.unstack(1)
@@ -49,7 +47,7 @@ def time_unstack(self):
class Unstack:
- params = ['int', 'category']
+ params = ["int", "category"]
def setup(self, dtype):
m = 100
@@ -58,7 +56,7 @@ def setup(self, dtype):
levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
columns = np.arange(n)
- if dtype == 'int':
+ if dtype == "int":
values = np.arange(m * m * n).reshape(m * m, n)
else:
# the category branch is ~20x slower than int. So we
@@ -80,84 +78,94 @@ def time_without_last_row(self, dtype):
class SparseIndex:
-
def setup(self):
NUM_ROWS = 1000
- self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS),
- 'B': np.random.randint(50, size=NUM_ROWS),
- 'C': np.random.randint(-10, 10, size=NUM_ROWS),
- 'D': np.random.randint(-10, 10, size=NUM_ROWS),
- 'E': np.random.randint(10, size=NUM_ROWS),
- 'F': np.random.randn(NUM_ROWS)})
- self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E'])
+ self.df = DataFrame(
+ {
+ "A": np.random.randint(50, size=NUM_ROWS),
+ "B": np.random.randint(50, size=NUM_ROWS),
+ "C": np.random.randint(-10, 10, size=NUM_ROWS),
+ "D": np.random.randint(-10, 10, size=NUM_ROWS),
+ "E": np.random.randint(10, size=NUM_ROWS),
+ "F": np.random.randn(NUM_ROWS),
+ }
+ )
+ self.df = self.df.set_index(["A", "B", "C", "D", "E"])
def time_unstack(self):
self.df.unstack()
class WideToLong:
-
def setup(self):
nyrs = 20
nidvars = 20
N = 5000
- self.letters = list('ABCD')
- yrvars = [l + str(num)
- for l, num in product(self.letters, range(1, nyrs + 1))]
+ self.letters = list("ABCD")
+ yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
columns = [str(i) for i in range(nidvars)] + yrvars
- self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)),
- columns=columns)
- self.df['id'] = self.df.index
+ self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
+ self.df["id"] = self.df.index
def time_wide_to_long_big(self):
- wide_to_long(self.df, self.letters, i='id', j='year')
+ wide_to_long(self.df, self.letters, i="id", j="year")
class PivotTable:
-
def setup(self):
N = 100000
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
ind1 = np.random.randint(0, 3, size=N)
ind2 = np.random.randint(0, 2, size=N)
- self.df = DataFrame({'key1': fac1.take(ind1),
- 'key2': fac2.take(ind2),
- 'key3': fac2.take(ind2),
- 'value1': np.random.randn(N),
- 'value2': np.random.randn(N),
- 'value3': np.random.randn(N)})
- self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'),
- 'col3': [1, 2, 3, 4, 5]})
- self.df2.col1 = self.df2.col1.astype('category')
- self.df2.col2 = self.df2.col2.astype('category')
+ self.df = DataFrame(
+ {
+ "key1": fac1.take(ind1),
+ "key2": fac2.take(ind2),
+ "key3": fac2.take(ind2),
+ "value1": np.random.randn(N),
+ "value2": np.random.randn(N),
+ "value3": np.random.randn(N),
+ }
+ )
+ self.df2 = DataFrame(
+ {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]}
+ )
+ self.df2.col1 = self.df2.col1.astype("category")
+ self.df2.col2 = self.df2.col2.astype("category")
def time_pivot_table(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'])
+ self.df.pivot_table(index="key1", columns=["key2", "key3"])
def time_pivot_table_agg(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'],
- aggfunc=['sum', 'mean'])
+ self.df.pivot_table(
+ index="key1", columns=["key2", "key3"], aggfunc=["sum", "mean"]
+ )
def time_pivot_table_margins(self):
- self.df.pivot_table(index='key1', columns=['key2', 'key3'],
- margins=True)
+ self.df.pivot_table(index="key1", columns=["key2", "key3"], margins=True)
def time_pivot_table_categorical(self):
- self.df2.pivot_table(index='col1', values='col3', columns='col2',
- aggfunc=np.sum, fill_value=0)
+ self.df2.pivot_table(
+ index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0
+ )
def time_pivot_table_categorical_observed(self):
- self.df2.pivot_table(index='col1', values='col3', columns='col2',
- aggfunc=np.sum, fill_value=0, observed=True)
+ self.df2.pivot_table(
+ index="col1",
+ values="col3",
+ columns="col2",
+ aggfunc=np.sum,
+ fill_value=0,
+ observed=True,
+ )
class Crosstab:
-
def setup(self):
N = 100000
- fac1 = np.array(['A', 'B', 'C'], dtype='O')
- fac2 = np.array(['one', 'two'], dtype='O')
+ fac1 = np.array(["A", "B", "C"], dtype="O")
+ fac2 = np.array(["one", "two"], dtype="O")
self.ind1 = np.random.randint(0, 3, size=N)
self.ind2 = np.random.randint(0, 2, size=N)
self.vec1 = fac1.take(self.ind1)
@@ -167,7 +175,7 @@ def time_crosstab(self):
pd.crosstab(self.vec1, self.vec2)
def time_crosstab_values(self):
- pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc='sum')
+ pd.crosstab(self.vec1, self.vec2, values=self.ind1, aggfunc="sum")
def time_crosstab_normalize(self):
pd.crosstab(self.vec1, self.vec2, normalize=True)
@@ -179,8 +187,10 @@ def time_crosstab_normalize_margins(self):
class GetDummies:
def setup(self):
categories = list(string.ascii_letters[:12])
- s = pd.Series(np.random.choice(categories, size=1000000),
- dtype=pd.api.types.CategoricalDtype(categories))
+ s = pd.Series(
+ np.random.choice(categories, size=1000000),
+ dtype=pd.api.types.CategoricalDtype(categories),
+ )
self.s = s
def time_get_dummies_1d(self):
@@ -192,16 +202,18 @@ def time_get_dummies_1d_sparse(self):
class Cut:
params = [[4, 10, 1000]]
- param_names = ['bins']
+ param_names = ["bins"]
def setup(self, bins):
- N = 10**5
+ N = 10 ** 5
self.int_series = pd.Series(np.arange(N).repeat(5))
self.float_series = pd.Series(np.random.randn(N).repeat(5))
- self.timedelta_series = pd.Series(np.random.randint(N, size=N),
- dtype='timedelta64[ns]')
- self.datetime_series = pd.Series(np.random.randint(N, size=N),
- dtype='datetime64[ns]')
+ self.timedelta_series = pd.Series(
+ np.random.randint(N, size=N), dtype="timedelta64[ns]"
+ )
+ self.datetime_series = pd.Series(
+ np.random.randint(N, size=N), dtype="datetime64[ns]"
+ )
def time_cut_int(self, bins):
pd.cut(self.int_series, bins)
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
index 033b466c8b9be..a70977fcf539f 100644
--- a/asv_bench/benchmarks/rolling.py
+++ b/asv_bench/benchmarks/rolling.py
@@ -4,15 +4,16 @@
class Methods:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)
@@ -22,14 +23,15 @@ def time_rolling(self, constructor, window, dtype, method):
class ExpandingMethods:
- params = (['DataFrame', 'Series'],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.expanding = getattr(pd, constructor)(arr).expanding()
@@ -39,14 +41,11 @@ def time_expanding(self, constructor, dtype, method):
class EWMMethods:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- ['mean', 'std'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (["DataFrame", "Series"], [10, 1000], ["int", "float"], ["mean", "std"])
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window)
@@ -55,29 +54,28 @@ def time_ewm(self, constructor, window, dtype, method):
class VariableWindowMethods(Methods):
- params = (['DataFrame', 'Series'],
- ['50s', '1h', '1d'],
- ['int', 'float'],
- ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt',
- 'sum'])
- param_names = ['contructor', 'window', 'dtype', 'method']
+ params = (
+ ["DataFrame", "Series"],
+ ["50s", "1h", "1d"],
+ ["int", "float"],
+ ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum"],
+ )
+ param_names = ["contructor", "window", "dtype", "method"]
def setup(self, constructor, window, dtype, method):
- N = 10**5
+ N = 10 ** 5
arr = (100 * np.random.random(N)).astype(dtype)
- index = pd.date_range('2017-01-01', periods=N, freq='5s')
+ index = pd.date_range("2017-01-01", periods=N, freq="5s")
self.roll = getattr(pd, constructor)(arr, index=index).rolling(window)
class Pairwise:
- params = ([10, 1000, None],
- ['corr', 'cov'],
- [True, False])
- param_names = ['window', 'method', 'pairwise']
+ params = ([10, 1000, None], ["corr", "cov"], [True, False])
+ param_names = ["window", "method", "pairwise"]
def setup(self, window, method, pairwise):
- N = 10**4
+ N = 10 ** 4
arr = np.random.random(N)
self.df = pd.DataFrame(arr)
@@ -90,25 +88,25 @@ def time_pairwise(self, window, method, pairwise):
class Quantile:
- params = (['DataFrame', 'Series'],
- [10, 1000],
- ['int', 'float'],
- [0, 0.5, 1],
- ['linear', 'nearest', 'lower', 'higher', 'midpoint'])
- param_names = ['constructor', 'window', 'dtype', 'percentile']
+ params = (
+ ["DataFrame", "Series"],
+ [10, 1000],
+ ["int", "float"],
+ [0, 0.5, 1],
+ ["linear", "nearest", "lower", "higher", "midpoint"],
+ )
+ param_names = ["constructor", "window", "dtype", "percentile"]
def setup(self, constructor, window, dtype, percentile, interpolation):
N = 10 ** 5
arr = np.random.random(N).astype(dtype)
self.roll = getattr(pd, constructor)(arr).rolling(window)
- def time_quantile(self, constructor, window, dtype, percentile,
- interpolation):
+ def time_quantile(self, constructor, window, dtype, percentile, interpolation):
self.roll.quantile(percentile, interpolation=interpolation)
class PeakMemFixed:
-
def setup(self):
N = 10
arr = 100 * np.random.random(N)
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index 4b1af2dc8c932..e2835c5156f55 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -7,13 +7,13 @@
class SeriesConstructor:
- params = [None, 'dict']
- param_names = ['data']
+ params = [None, "dict"]
+ param_names = ["data"]
def setup(self, data):
- self.idx = date_range(start=datetime(2015, 10, 26),
- end=datetime(2016, 1, 1),
- freq='50s')
+ self.idx = date_range(
+ start=datetime(2015, 10, 26), end=datetime(2016, 1, 1), freq="50s"
+ )
dict_data = dict(zip(self.idx, range(len(self.idx))))
self.data = None if data is None else dict_data
@@ -23,8 +23,8 @@ def time_constructor(self, data):
class IsIn:
- params = ['int64', 'uint64', 'object']
- param_names = ['dtype']
+ params = ["int64", "uint64", "object"]
+ param_names = ["dtype"]
def setup(self, dtype):
self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype)
@@ -35,12 +35,11 @@ def time_isin(self, dtypes):
class IsInFloat64:
-
def setup(self):
self.small = Series([1, 2], dtype=np.float64)
- self.many_different_values = np.arange(10**6, dtype=np.float64)
- self.few_different_values = np.zeros(10**7, dtype=np.float64)
- self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64)
+ self.many_different_values = np.arange(10 ** 6, dtype=np.float64)
+ self.few_different_values = np.zeros(10 ** 7, dtype=np.float64)
+ self.only_nans_values = np.full(10 ** 7, np.nan, dtype=np.float64)
def time_isin_many_different(self):
# runtime is dominated by creation of the lookup-table
@@ -56,19 +55,18 @@ def time_isin_nan_values(self):
class IsInForObjects:
-
def setup(self):
- self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object)
- self.vals_nans = np.full(10**4, np.nan).astype(np.object)
+ self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(np.object)
+ self.vals_nans = np.full(10 ** 4, np.nan).astype(np.object)
self.s_short = Series(np.arange(2)).astype(np.object)
- self.s_long = Series(np.arange(10**5)).astype(np.object)
+ self.s_long = Series(np.arange(10 ** 5)).astype(np.object)
self.vals_short = np.arange(2).astype(np.object)
- self.vals_long = np.arange(10**5).astype(np.object)
+ self.vals_long = np.arange(10 ** 5).astype(np.object)
# because of nans floats are special:
- self.s_long_floats = Series(np.arange(10**5,
- dtype=np.float)).astype(np.object)
- self.vals_long_floats = np.arange(10**5,
- dtype=np.float).astype(np.object)
+ self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(
+ np.object
+ )
+ self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(np.object)
def time_isin_nans(self):
# if nan-objects are different objects,
@@ -94,8 +92,8 @@ def time_isin_long_series_long_values_floats(self):
class NSort:
- params = ['first', 'last', 'all']
- param_names = ['keep']
+ params = ["first", "last", "all"]
+ param_names = ["keep"]
def setup(self, keep):
self.s = Series(np.random.randint(1, 10, 100000))
@@ -109,15 +107,17 @@ def time_nsmallest(self, keep):
class Dropna:
- params = ['int', 'datetime']
- param_names = ['dtype']
+ params = ["int", "datetime"]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**6
- data = {'int': np.random.randint(1, 10, N),
- 'datetime': date_range('2000-01-01', freq='S', periods=N)}
+ N = 10 ** 6
+ data = {
+ "int": np.random.randint(1, 10, N),
+ "datetime": date_range("2000-01-01", freq="S", periods=N),
+ }
self.s = Series(data[dtype])
- if dtype == 'datetime':
+ if dtype == "datetime":
self.s[np.random.randint(1, N, 100)] = NaT
def time_dropna(self, dtype):
@@ -127,37 +127,47 @@ def time_dropna(self, dtype):
class SearchSorted:
goal_time = 0.2
- params = ['int8', 'int16', 'int32', 'int64',
- 'uint8', 'uint16', 'uint32', 'uint64',
- 'float16', 'float32', 'float64',
- 'str']
- param_names = ['dtype']
+ params = [
+ "int8",
+ "int16",
+ "int32",
+ "int64",
+ "uint8",
+ "uint16",
+ "uint32",
+ "uint64",
+ "float16",
+ "float32",
+ "float64",
+ "str",
+ ]
+ param_names = ["dtype"]
def setup(self, dtype):
- N = 10**5
+ N = 10 ** 5
data = np.array([1] * N + [2] * N + [3] * N).astype(dtype)
self.s = Series(data)
def time_searchsorted(self, dtype):
- key = '2' if dtype == 'str' else 2
+ key = "2" if dtype == "str" else 2
self.s.searchsorted(key)
class Map:
- params = (['dict', 'Series', 'lambda'], ['object', 'category', 'int'])
- param_names = 'mapper'
+ params = (["dict", "Series", "lambda"], ["object", "category", "int"])
+ param_names = "mapper"
def setup(self, mapper, dtype):
map_size = 1000
map_data = Series(map_size - np.arange(map_size), dtype=dtype)
# construct mapper
- if mapper == 'Series':
+ if mapper == "Series":
self.map_data = map_data
- elif mapper == 'dict':
+ elif mapper == "dict":
self.map_data = map_data.to_dict()
- elif mapper == 'lambda':
+ elif mapper == "lambda":
map_dict = map_data.to_dict()
self.map_data = lambda x: map_dict[x]
else:
@@ -170,8 +180,8 @@ def time_map(self, mapper, *args, **kwargs):
class Clip:
- params = [50, 1000, 10**5]
- param_names = ['n']
+ params = [50, 1000, 10 ** 5]
+ param_names = ["n"]
def setup(self, n):
self.s = Series(np.random.randn(n))
@@ -182,8 +192,8 @@ def time_clip(self, n):
class ValueCounts:
- params = ['int', 'uint', 'float', 'object']
- param_names = ['dtype']
+ params = ["int", "uint", "float", "object"]
+ param_names = ["dtype"]
def setup(self, dtype):
self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)
@@ -193,7 +203,6 @@ def time_value_counts(self, dtype):
class Dir:
-
def setup(self):
self.s = Series(index=tm.makeStringIndex(10000))
@@ -204,21 +213,19 @@ def time_dir_strings(self):
class SeriesGetattr:
# https://github.com/pandas-dev/pandas/issues/19764
def setup(self):
- self.s = Series(1,
- index=date_range("2012-01-01", freq='s',
- periods=int(1e6)))
+ self.s = Series(1, index=date_range("2012-01-01", freq="s", periods=int(1e6)))
def time_series_datetimeindex_repr(self):
- getattr(self.s, 'a', None)
+ getattr(self.s, "a", None)
class All(object):
- params = [[10**3, 10**6], ['fast', 'slow']]
- param_names = ['N', 'case']
+ params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
+ param_names = ["N", "case"]
def setup(self, N, case):
- val = case != 'fast'
+ val = case != "fast"
self.s = Series([val] * N)
def time_all(self, N, case):
@@ -227,11 +234,11 @@ def time_all(self, N, case):
class Any(object):
- params = [[10**3, 10**6], ['fast', 'slow']]
- param_names = ['N', 'case']
+ params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
+ param_names = ["N", "case"]
def setup(self, N, case):
- val = case == 'fast'
+ val = case == "fast"
self.s = Series([val] * N)
def time_any(self, N, case):
@@ -240,11 +247,25 @@ def time_any(self, N, case):
class NanOps(object):
- params = [['var', 'mean', 'median', 'max', 'min', 'sum', 'std', 'sem',
- 'argmax', 'skew', 'kurt', 'prod'],
- [10**3, 10**6],
- ['int8', 'int32', 'int64', 'float64']]
- param_names = ['func', 'N', 'dtype']
+ params = [
+ [
+ "var",
+ "mean",
+ "median",
+ "max",
+ "min",
+ "sum",
+ "std",
+ "sem",
+ "argmax",
+ "skew",
+ "kurt",
+ "prod",
+ ],
+ [10 ** 3, 10 ** 6],
+ ["int8", "int32", "int64", "float64"],
+ ]
+ param_names = ["func", "N", "dtype"]
def setup(self, func, N, dtype):
self.s = Series([1] * N, dtype=dtype)
diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py
index 281e81f21ba9c..19d08c086a508 100644
--- a/asv_bench/benchmarks/sparse.py
+++ b/asv_bench/benchmarks/sparse.py
@@ -14,11 +14,10 @@ def make_array(size, dense_proportion, fill_value, dtype):
class SparseSeriesToFrame:
-
def setup(self):
K = 50
N = 50001
- rng = date_range('1/1/2000', periods=N, freq='T')
+ rng = date_range("1/1/2000", periods=N, freq="T")
self.series = {}
for i in range(1, K):
data = np.random.randn(N)[:-i]
@@ -32,12 +31,11 @@ def time_series_to_frame(self):
class SparseArrayConstructor:
- params = ([0.1, 0.01], [0, np.nan],
- [np.int64, np.float64, np.object])
- param_names = ['dense_proportion', 'fill_value', 'dtype']
+ params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object])
+ param_names = ["dense_proportion", "fill_value", "dtype"]
def setup(self, dense_proportion, fill_value, dtype):
- N = 10**6
+ N = 10 ** 6
self.array = make_array(N, dense_proportion, fill_value, dtype)
def time_sparse_array(self, dense_proportion, fill_value, dtype):
@@ -45,7 +43,6 @@ def time_sparse_array(self, dense_proportion, fill_value, dtype):
class SparseDataFrameConstructor:
-
def setup(self):
N = 1000
self.arr = np.arange(N)
@@ -56,18 +53,16 @@ def time_from_scipy(self):
class FromCoo:
-
def setup(self):
- self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0],
- ([1, 0, 0], [0, 2, 3])),
- shape=(100, 100))
+ self.matrix = scipy.sparse.coo_matrix(
+ ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)
+ )
def time_sparse_series_from_coo(self):
pd.Series.sparse.from_coo(self.matrix)
class ToCoo:
-
def setup(self):
s = Series([np.nan] * 10000)
s[0] = 3.0
@@ -77,18 +72,16 @@ def setup(self):
self.ss = s.astype("Sparse")
def time_sparse_series_to_coo(self):
- self.ss.sparse.to_coo(row_levels=[0, 1],
- column_levels=[2, 3],
- sort_labels=True)
+ self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
class Arithmetic:
params = ([0.1, 0.01], [0, np.nan])
- param_names = ['dense_proportion', 'fill_value']
+ param_names = ["dense_proportion", "fill_value"]
def setup(self, dense_proportion, fill_value):
- N = 10**6
+ N = 10 ** 6
arr1 = make_array(N, dense_proportion, fill_value, np.int64)
self.array1 = SparseArray(arr1, fill_value=fill_value)
arr2 = make_array(N, dense_proportion, fill_value, np.int64)
@@ -110,22 +103,24 @@ def time_divide(self, dense_proportion, fill_value):
class ArithmeticBlock:
params = [np.nan, 0]
- param_names = ['fill_value']
+ param_names = ["fill_value"]
def setup(self, fill_value):
- N = 10**6
- self.arr1 = self.make_block_array(length=N, num_blocks=1000,
- block_size=10, fill_value=fill_value)
- self.arr2 = self.make_block_array(length=N, num_blocks=1000,
- block_size=10, fill_value=fill_value)
+ N = 10 ** 6
+ self.arr1 = self.make_block_array(
+ length=N, num_blocks=1000, block_size=10, fill_value=fill_value
+ )
+ self.arr2 = self.make_block_array(
+ length=N, num_blocks=1000, block_size=10, fill_value=fill_value
+ )
def make_block_array(self, length, num_blocks, block_size, fill_value):
arr = np.full(length, fill_value)
- indicies = np.random.choice(np.arange(0, length, block_size),
- num_blocks,
- replace=False)
+ indicies = np.random.choice(
+ np.arange(0, length, block_size), num_blocks, replace=False
+ )
for ind in indicies:
- arr[ind:ind + block_size] = np.random.randint(0, 100, block_size)
+ arr[ind : ind + block_size] = np.random.randint(0, 100, block_size)
return SparseArray(arr, fill_value=fill_value)
def time_make_union(self, fill_value):
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py
index 3514335f92e77..620a6de0f5f34 100644
--- a/asv_bench/benchmarks/stat_ops.py
+++ b/asv_bench/benchmarks/stat_ops.py
@@ -2,14 +2,13 @@
import pandas as pd
-ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem',
- 'var']
+ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"]
class FrameOps:
- params = [ops, ['float', 'int'], [0, 1], [True, False]]
- param_names = ['op', 'dtype', 'axis', 'use_bottleneck']
+ params = [ops, ["float", "int"], [0, 1], [True, False]]
+ param_names = ["op", "dtype", "axis", "use_bottleneck"]
def setup(self, op, dtype, axis, use_bottleneck):
df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype)
@@ -17,6 +16,7 @@ def setup(self, op, dtype, axis, use_bottleneck):
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.df_func = getattr(df, op)
@@ -27,13 +27,15 @@ def time_op(self, op, dtype, axis, use_bottleneck):
class FrameMultiIndexOps:
params = ([0, 1, [0, 1]], ops)
- param_names = ['level', 'op']
+ param_names = ["level", "op"]
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
- codes = [np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)]
+ codes = [
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ]
index = pd.MultiIndex(levels=levels, codes=codes)
df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
self.df_func = getattr(df, op)
@@ -44,8 +46,8 @@ def time_op(self, level, op):
class SeriesOps:
- params = [ops, ['float', 'int'], [True, False]]
- param_names = ['op', 'dtype', 'use_bottleneck']
+ params = [ops, ["float", "int"], [True, False]]
+ param_names = ["op", "dtype", "use_bottleneck"]
def setup(self, op, dtype, use_bottleneck):
s = pd.Series(np.random.randn(100000)).astype(dtype)
@@ -53,6 +55,7 @@ def setup(self, op, dtype, use_bottleneck):
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.s_func = getattr(s, op)
@@ -63,13 +66,15 @@ def time_op(self, op, dtype, use_bottleneck):
class SeriesMultiIndexOps:
params = ([0, 1, [0, 1]], ops)
- param_names = ['level', 'op']
+ param_names = ["level", "op"]
def setup(self, level, op):
levels = [np.arange(10), np.arange(100), np.arange(100)]
- codes = [np.arange(10).repeat(10000),
- np.tile(np.arange(100).repeat(100), 10),
- np.tile(np.tile(np.arange(100), 100), 10)]
+ codes = [
+ np.arange(10).repeat(10000),
+ np.tile(np.arange(100).repeat(100), 10),
+ np.tile(np.tile(np.arange(100), 100), 10),
+ ]
index = pd.MultiIndex(levels=levels, codes=codes)
s = pd.Series(np.random.randn(len(index)), index=index)
self.s_func = getattr(s, op)
@@ -80,11 +85,11 @@ def time_op(self, level, op):
class Rank:
- params = [['DataFrame', 'Series'], [True, False]]
- param_names = ['constructor', 'pct']
+ params = [["DataFrame", "Series"], [True, False]]
+ param_names = ["constructor", "pct"]
def setup(self, constructor, pct):
- values = np.random.randn(10**5)
+ values = np.random.randn(10 ** 5)
self.data = getattr(pd, constructor)(values)
def time_rank(self, constructor, pct):
@@ -96,14 +101,15 @@ def time_average_old(self, constructor, pct):
class Correlation:
- params = [['spearman', 'kendall', 'pearson'], [True, False]]
- param_names = ['method', 'use_bottleneck']
+ params = [["spearman", "kendall", "pearson"], [True, False]]
+ param_names = ["method", "use_bottleneck"]
def setup(self, method, use_bottleneck):
try:
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.df = pd.DataFrame(np.random.randn(1000, 30))
self.df2 = pd.DataFrame(np.random.randn(1000, 30))
@@ -126,13 +132,14 @@ def time_corrwith_rows(self, method, use_bottleneck):
class Covariance:
params = [[True, False]]
- param_names = ['use_bottleneck']
+ param_names = ["use_bottleneck"]
def setup(self, use_bottleneck):
try:
pd.options.compute.use_bottleneck = use_bottleneck
except TypeError:
from pandas.core import nanops
+
nanops._USE_BOTTLENECK = use_bottleneck
self.s = pd.Series(np.random.randn(100000))
self.s2 = pd.Series(np.random.randn(100000))
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 5dbcc71b7455e..6be2fa92d9eac 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -6,31 +6,30 @@
class Methods:
-
def setup(self):
- self.s = Series(tm.makeStringIndex(10**5))
+ self.s = Series(tm.makeStringIndex(10 ** 5))
def time_center(self):
self.s.str.center(100)
def time_count(self):
- self.s.str.count('A')
+ self.s.str.count("A")
def time_endswith(self):
- self.s.str.endswith('A')
+ self.s.str.endswith("A")
def time_extract(self):
with warnings.catch_warnings(record=True):
- self.s.str.extract('(\\w*)A(\\w*)')
+ self.s.str.extract("(\\w*)A(\\w*)")
def time_findall(self):
- self.s.str.findall('[A-Z]+')
+ self.s.str.findall("[A-Z]+")
def time_find(self):
- self.s.str.find('[A-Z]+')
+ self.s.str.find("[A-Z]+")
def time_rfind(self):
- self.s.str.rfind('[A-Z]+')
+ self.s.str.rfind("[A-Z]+")
def time_get(self):
self.s.str.get(0)
@@ -39,43 +38,43 @@ def time_len(self):
self.s.str.len()
def time_join(self):
- self.s.str.join(' ')
+ self.s.str.join(" ")
def time_match(self):
- self.s.str.match('A')
+ self.s.str.match("A")
def time_normalize(self):
- self.s.str.normalize('NFC')
+ self.s.str.normalize("NFC")
def time_pad(self):
- self.s.str.pad(100, side='both')
+ self.s.str.pad(100, side="both")
def time_partition(self):
- self.s.str.partition('A')
+ self.s.str.partition("A")
def time_rpartition(self):
- self.s.str.rpartition('A')
+ self.s.str.rpartition("A")
def time_replace(self):
- self.s.str.replace('A', '\x01\x01')
+ self.s.str.replace("A", "\x01\x01")
def time_translate(self):
- self.s.str.translate({'A': '\x01\x01'})
+ self.s.str.translate({"A": "\x01\x01"})
def time_slice(self):
self.s.str.slice(5, 15, 2)
def time_startswith(self):
- self.s.str.startswith('A')
+ self.s.str.startswith("A")
def time_strip(self):
- self.s.str.strip('A')
+ self.s.str.strip("A")
def time_rstrip(self):
- self.s.str.rstrip('A')
+ self.s.str.rstrip("A")
def time_lstrip(self):
- self.s.str.lstrip('A')
+ self.s.str.lstrip("A")
def time_title(self):
self.s.str.title()
@@ -95,13 +94,13 @@ def time_zfill(self):
class Repeat:
- params = ['int', 'array']
- param_names = ['repeats']
+ params = ["int", "array"]
+ param_names = ["repeats"]
def setup(self, repeats):
- N = 10**5
+ N = 10 ** 5
self.s = Series(tm.makeStringIndex(N))
- repeat = {'int': 1, 'array': np.random.randint(1, 3, N)}
+ repeat = {"int": 1, "array": np.random.randint(1, 3, N)}
self.values = repeat[repeats]
def time_repeat(self, repeats):
@@ -110,20 +109,20 @@ def time_repeat(self, repeats):
class Cat:
- params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15])
- param_names = ['other_cols', 'sep', 'na_rep', 'na_frac']
+ params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15])
+ param_names = ["other_cols", "sep", "na_rep", "na_frac"]
def setup(self, other_cols, sep, na_rep, na_frac):
N = 10 ** 5
- mask_gen = lambda: np.random.choice([True, False], N,
- p=[1 - na_frac, na_frac])
+ mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac])
self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
if other_cols == 0:
# str.cat self-concatenates only for others=None
self.others = None
else:
- self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
- for i in range(other_cols)})
+ self.others = DataFrame(
+ {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)}
+ )
def time_cat(self, other_cols, sep, na_rep, na_frac):
# before the concatenation (one caller + other_cols columns), the total
@@ -136,52 +135,49 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
class Contains:
params = [True, False]
- param_names = ['regex']
+ param_names = ["regex"]
def setup(self, regex):
- self.s = Series(tm.makeStringIndex(10**5))
+ self.s = Series(tm.makeStringIndex(10 ** 5))
def time_contains(self, regex):
- self.s.str.contains('A', regex=regex)
+ self.s.str.contains("A", regex=regex)
class Split:
params = [True, False]
- param_names = ['expand']
+ param_names = ["expand"]
def setup(self, expand):
- self.s = Series(tm.makeStringIndex(10**5)).str.join('--')
+ self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("--")
def time_split(self, expand):
- self.s.str.split('--', expand=expand)
+ self.s.str.split("--", expand=expand)
def time_rsplit(self, expand):
- self.s.str.rsplit('--', expand=expand)
+ self.s.str.rsplit("--", expand=expand)
class Dummies:
-
def setup(self):
- self.s = Series(tm.makeStringIndex(10**5)).str.join('|')
+ self.s = Series(tm.makeStringIndex(10 ** 5)).str.join("|")
def time_get_dummies(self):
- self.s.str.get_dummies('|')
+ self.s.str.get_dummies("|")
class Encode:
-
def setup(self):
self.ser = Series(tm.makeUnicodeIndex())
def time_encode_decode(self):
- self.ser.str.encode('utf-8').str.decode('utf-8')
+ self.ser.str.encode("utf-8").str.decode("utf-8")
class Slice:
-
def setup(self):
- self.s = Series(['abcdefg', np.nan] * 500000)
+ self.s = Series(["abcdefg", np.nan] * 500000)
def time_vector_slice(self):
# GH 2602
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
index c4fe462944a2a..36a9db529f98f 100644
--- a/asv_bench/benchmarks/timedelta.py
+++ b/asv_bench/benchmarks/timedelta.py
@@ -3,49 +3,60 @@
import numpy as np
from pandas import (
- DataFrame, Series, Timedelta, Timestamp, timedelta_range, to_timedelta)
+ DataFrame,
+ Series,
+ Timedelta,
+ Timestamp,
+ timedelta_range,
+ to_timedelta,
+)
class TimedeltaConstructor:
-
def time_from_int(self):
Timedelta(123456789)
def time_from_unit(self):
- Timedelta(1, unit='d')
+ Timedelta(1, unit="d")
def time_from_components(self):
- Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5,
- microseconds=6, nanoseconds=7)
+ Timedelta(
+ days=1,
+ hours=2,
+ minutes=3,
+ seconds=4,
+ milliseconds=5,
+ microseconds=6,
+ nanoseconds=7,
+ )
def time_from_datetime_timedelta(self):
Timedelta(datetime.timedelta(days=1, seconds=1))
def time_from_np_timedelta(self):
- Timedelta(np.timedelta64(1, 'ms'))
+ Timedelta(np.timedelta64(1, "ms"))
def time_from_string(self):
- Timedelta('1 days')
+ Timedelta("1 days")
def time_from_iso_format(self):
- Timedelta('P4DT12H30M5S')
+ Timedelta("P4DT12H30M5S")
def time_from_missing(self):
- Timedelta('nat')
+ Timedelta("nat")
class ToTimedelta:
-
def setup(self):
self.ints = np.random.randint(0, 60, size=10000)
self.str_days = []
self.str_seconds = []
for i in self.ints:
- self.str_days.append('{0} days'.format(i))
- self.str_seconds.append('00:00:{0:02d}'.format(i))
+ self.str_days.append("{0} days".format(i))
+ self.str_seconds.append("00:00:{0:02d}".format(i))
def time_convert_int(self):
- to_timedelta(self.ints, unit='s')
+ to_timedelta(self.ints, unit="s")
def time_convert_string_days(self):
to_timedelta(self.str_days)
@@ -56,30 +67,28 @@ def time_convert_string_seconds(self):
class ToTimedeltaErrors:
- params = ['coerce', 'ignore']
- param_names = ['errors']
+ params = ["coerce", "ignore"]
+ param_names = ["errors"]
def setup(self, errors):
ints = np.random.randint(0, 60, size=10000)
- self.arr = ['{0} days'.format(i) for i in ints]
- self.arr[-1] = 'apple'
+ self.arr = ["{0} days".format(i) for i in ints]
+ self.arr[-1] = "apple"
def time_convert(self, errors):
to_timedelta(self.arr, errors=errors)
class TimedeltaOps:
-
def setup(self):
self.td = to_timedelta(np.arange(1000000))
- self.ts = Timestamp('2000')
+ self.ts = Timestamp("2000")
def time_add_td_ts(self):
self.td + self.ts
class TimedeltaProperties:
-
def setup_cache(self):
td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35)
return td
@@ -98,10 +107,9 @@ def time_timedelta_nanoseconds(self, td):
class DatetimeAccessor:
-
def setup_cache(self):
N = 100000
- series = Series(timedelta_range('1 days', periods=N, freq='h'))
+ series = Series(timedelta_range("1 days", periods=N, freq="h"))
return series
def time_dt_accessor(self, series):
@@ -121,10 +129,9 @@ def time_timedelta_nanoseconds(self, series):
class TimedeltaIndexing:
-
def setup(self):
- self.index = timedelta_range(start='1985', periods=1000, freq='D')
- self.index2 = timedelta_range(start='1986', periods=1000, freq='D')
+ self.index = timedelta_range(start="1985", periods=1000, freq="D")
+ self.index2 = timedelta_range(start="1986", periods=1000, freq="D")
self.series = Series(range(1000), index=self.index)
self.timedelta = self.index[500]
@@ -141,7 +148,7 @@ def time_series_loc(self):
self.series.loc[self.timedelta]
def time_align(self):
- DataFrame({'a': self.series, 'b': self.series[:500]})
+ DataFrame({"a": self.series, "b": self.series[:500]})
def time_intersection(self):
self.index.intersection(self.index2)
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 14ee8747cf81d..a74527df25f9b 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -4,6 +4,7 @@
import numpy as np
from pandas import to_datetime, date_range, Series, DataFrame, period_range
from pandas.tseries.frequencies import infer_freq
+
try:
from pandas.plotting._matplotlib.converter import DatetimeConverter
except ImportError:
@@ -12,27 +13,22 @@
class DatetimeIndex:
- params = ['dst', 'repeated', 'tz_aware', 'tz_local', 'tz_naive']
- param_names = ['index_type']
+ params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"]
+ param_names = ["index_type"]
def setup(self, index_type):
N = 100000
- dtidxes = {'dst': date_range(start='10/29/2000 1:00:00',
- end='10/29/2000 1:59:59', freq='S'),
- 'repeated': date_range(start='2000',
- periods=N / 10,
- freq='s').repeat(10),
- 'tz_aware': date_range(start='2000',
- periods=N,
- freq='s',
- tz='US/Eastern'),
- 'tz_local': date_range(start='2000',
- periods=N,
- freq='s',
- tz=dateutil.tz.tzlocal()),
- 'tz_naive': date_range(start='2000',
- periods=N,
- freq='s')}
+ dtidxes = {
+ "dst": date_range(
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ ),
+ "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10),
+ "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"),
+ "tz_local": date_range(
+ start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal()
+ ),
+ "tz_naive": date_range(start="2000", periods=N, freq="s"),
+ }
self.index = dtidxes[index_type]
def time_add_timedelta(self, index_type):
@@ -62,31 +58,31 @@ def time_to_pydatetime(self, index_type):
class TzLocalize:
- params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
- param_names = 'tz'
+ params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]
+ param_names = "tz"
def setup(self, tz):
- dst_rng = date_range(start='10/29/2000 1:00:00',
- end='10/29/2000 1:59:59', freq='S')
- self.index = date_range(start='10/29/2000',
- end='10/29/2000 00:59:59', freq='S')
+ dst_rng = date_range(
+ start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S"
+ )
+ self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S")
self.index = self.index.append(dst_rng)
self.index = self.index.append(dst_rng)
- self.index = self.index.append(date_range(start='10/29/2000 2:00:00',
- end='10/29/2000 3:00:00',
- freq='S'))
+ self.index = self.index.append(
+ date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S")
+ )
def time_infer_dst(self, tz):
- self.index.tz_localize(tz, ambiguous='infer')
+ self.index.tz_localize(tz, ambiguous="infer")
class ResetIndex:
- params = [None, 'US/Eastern']
- param_names = 'tz'
+ params = [None, "US/Eastern"]
+ param_names = "tz"
def setup(self, tz):
- idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz)
+ idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz)
self.df = DataFrame(np.random.randn(1000, 2), index=idx)
def time_reest_datetimeindex(self, tz):
@@ -95,12 +91,12 @@ def time_reest_datetimeindex(self, tz):
class Factorize:
- params = [None, 'Asia/Tokyo']
- param_names = 'tz'
+ params = [None, "Asia/Tokyo"]
+ param_names = "tz"
def setup(self, tz):
N = 100000
- self.dti = date_range('2011-01-01', freq='H', periods=N, tz=tz)
+ self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz)
self.dti = self.dti.repeat(5)
def time_factorize(self, tz):
@@ -109,25 +105,24 @@ def time_factorize(self, tz):
class InferFreq:
- params = [None, 'D', 'B']
- param_names = ['freq']
+ params = [None, "D", "B"]
+ param_names = ["freq"]
def setup(self, freq):
if freq is None:
- self.idx = date_range(start='1/1/1700', freq='D', periods=10000)
+ self.idx = date_range(start="1/1/1700", freq="D", periods=10000)
self.idx.freq = None
else:
- self.idx = date_range(start='1/1/1700', freq=freq, periods=10000)
+ self.idx = date_range(start="1/1/1700", freq=freq, periods=10000)
def time_infer_freq(self, freq):
infer_freq(self.idx)
class TimeDatetimeConverter:
-
def setup(self):
N = 100000
- self.rng = date_range(start='1/1/2000', periods=N, freq='T')
+ self.rng = date_range(start="1/1/2000", periods=N, freq="T")
def time_convert(self):
DatetimeConverter.convert(self.rng, None, None)
@@ -136,11 +131,11 @@ def time_convert(self):
class Iteration:
params = [date_range, period_range]
- param_names = ['time_index']
+ param_names = ["time_index"]
def setup(self, time_index):
- N = 10**6
- self.idx = time_index(start='20140101', freq='T', periods=N)
+ N = 10 ** 6
+ self.idx = time_index(start="20140101", freq="T", periods=N)
self.exit = 10000
def time_iter(self, time_index):
@@ -155,13 +150,13 @@ def time_iter_preexit(self, time_index):
class ResampleDataFrame:
- params = ['max', 'mean', 'min']
- param_names = ['method']
+ params = ["max", "mean", "min"]
+ param_names = ["method"]
def setup(self, method):
- rng = date_range(start='20130101', periods=100000, freq='50L')
+ rng = date_range(start="20130101", periods=100000, freq="50L")
df = DataFrame(np.random.randn(100000, 2), index=rng)
- self.resample = getattr(df.resample('1s'), method)
+ self.resample = getattr(df.resample("1s"), method)
def time_method(self, method):
self.resample()
@@ -169,16 +164,14 @@ def time_method(self, method):
class ResampleSeries:
- params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc'])
- param_names = ['index', 'freq', 'method']
+ params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"])
+ param_names = ["index", "freq", "method"]
def setup(self, index, freq, method):
- indexes = {'period': period_range(start='1/1/2000',
- end='1/1/2001',
- freq='T'),
- 'datetime': date_range(start='1/1/2000',
- end='1/1/2001',
- freq='T')}
+ indexes = {
+ "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"),
+ }
idx = indexes[index]
ts = Series(np.random.randn(len(idx)), index=idx)
self.resample = getattr(ts.resample(freq), method)
@@ -190,32 +183,35 @@ def time_resample(self, index, freq, method):
class ResampleDatetetime64:
# GH 7754
def setup(self):
- rng3 = date_range(start='2000-01-01 00:00:00',
- end='2000-01-01 10:00:00', freq='555000U')
- self.dt_ts = Series(5, rng3, dtype='datetime64[ns]')
+ rng3 = date_range(
+ start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U"
+ )
+ self.dt_ts = Series(5, rng3, dtype="datetime64[ns]")
def time_resample(self):
- self.dt_ts.resample('1S').last()
+ self.dt_ts.resample("1S").last()
class AsOf:
- params = ['DataFrame', 'Series']
- param_names = ['constructor']
+ params = ["DataFrame", "Series"]
+ param_names = ["constructor"]
def setup(self, constructor):
N = 10000
M = 10
- rng = date_range(start='1/1/1990', periods=N, freq='53s')
- data = {'DataFrame': DataFrame(np.random.randn(N, M)),
- 'Series': Series(np.random.randn(N))}
+ rng = date_range(start="1/1/1990", periods=N, freq="53s")
+ data = {
+ "DataFrame": DataFrame(np.random.randn(N, M)),
+ "Series": Series(np.random.randn(N)),
+ }
self.ts = data[constructor]
self.ts.index = rng
self.ts2 = self.ts.copy()
self.ts2.iloc[250:5000] = np.nan
self.ts3 = self.ts.copy()
self.ts3.iloc[-5000:] = np.nan
- self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s')
+ self.dates = date_range(start="1/1/1990", periods=N * 10, freq="5s")
self.date = self.dates[0]
self.date_last = self.dates[-1]
self.date_early = self.date - timedelta(10)
@@ -248,11 +244,11 @@ def time_asof_nan_single(self, constructor):
class SortIndex:
params = [True, False]
- param_names = ['monotonic']
+ param_names = ["monotonic"]
def setup(self, monotonic):
- N = 10**5
- idx = date_range(start='1/1/2000', periods=N, freq='s')
+ N = 10 ** 5
+ idx = date_range(start="1/1/2000", periods=N, freq="s")
self.s = Series(np.random.randn(N), index=idx)
if not monotonic:
self.s = self.s.sample(frac=1)
@@ -265,10 +261,9 @@ def time_get_slice(self, monotonic):
class IrregularOps:
-
def setup(self):
- N = 10**5
- idx = date_range(start='1/1/2000', periods=N, freq='s')
+ N = 10 ** 5
+ idx = date_range(start="1/1/2000", periods=N, freq="s")
s = Series(np.random.randn(N), index=idx)
self.left = s.sample(frac=1)
self.right = s.sample(frac=1)
@@ -278,10 +273,9 @@ def time_add(self):
class Lookup:
-
def setup(self):
N = 1500000
- rng = date_range(start='1/1/2000', periods=N, freq='S')
+ rng = date_range(start="1/1/2000", periods=N, freq="S")
self.ts = Series(1, index=rng)
self.lookup_val = rng[N // 2]
@@ -291,36 +285,35 @@ def time_lookup_and_cleanup(self):
class ToDatetimeYYYYMMDD:
-
def setup(self):
- rng = date_range(start='1/1/2000', periods=10000, freq='D')
- self.stringsD = Series(rng.strftime('%Y%m%d'))
+ rng = date_range(start="1/1/2000", periods=10000, freq="D")
+ self.stringsD = Series(rng.strftime("%Y%m%d"))
def time_format_YYYYMMDD(self):
- to_datetime(self.stringsD, format='%Y%m%d')
+ to_datetime(self.stringsD, format="%Y%m%d")
class ToDatetimeCacheSmallCount(object):
params = ([True, False], [50, 500, 5000, 100000])
- param_names = ['cache', 'count']
+ param_names = ["cache", "count"]
def setup(self, cache, count):
- rng = date_range(start='1/1/1971', periods=count)
- self.unique_date_strings = rng.strftime('%Y-%m-%d').tolist()
+ rng = date_range(start="1/1/1971", periods=count)
+ self.unique_date_strings = rng.strftime("%Y-%m-%d").tolist()
def time_unique_date_strings(self, cache, count):
to_datetime(self.unique_date_strings, cache=cache)
class ToDatetimeISO8601:
-
def setup(self):
- rng = date_range(start='1/1/2000', periods=20000, freq='H')
- self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist()
- self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist()
- self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800'
- for x in rng]
+ rng = date_range(start="1/1/2000", periods=20000, freq="H")
+ self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
+ self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist()
+ self.strings_tz_space = [
+ x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
+ ]
def time_iso8601(self):
to_datetime(self.strings)
@@ -329,22 +322,21 @@ def time_iso8601_nosep(self):
to_datetime(self.strings_nosep)
def time_iso8601_format(self):
- to_datetime(self.strings, format='%Y-%m-%d %H:%M:%S')
+ to_datetime(self.strings, format="%Y-%m-%d %H:%M:%S")
def time_iso8601_format_no_sep(self):
- to_datetime(self.strings_nosep, format='%Y%m%d %H:%M:%S')
+ to_datetime(self.strings_nosep, format="%Y%m%d %H:%M:%S")
def time_iso8601_tz_spaceformat(self):
to_datetime(self.strings_tz_space)
class ToDatetimeNONISO8601:
-
def setup(self):
N = 10000
half = int(N / 2)
- ts_string_1 = 'March 1, 2018 12:00:00+0400'
- ts_string_2 = 'March 1, 2018 12:00:00+0500'
+ ts_string_1 = "March 1, 2018 12:00:00+0400"
+ ts_string_2 = "March 1, 2018 12:00:00+0500"
self.same_offset = [ts_string_1] * N
self.diff_offset = [ts_string_1] * half + [ts_string_2] * half
@@ -356,50 +348,48 @@ def time_different_offset(self):
class ToDatetimeFormatQuarters:
-
def setup(self):
- self.s = Series(['2Q2005', '2Q05', '2005Q1', '05Q1'] * 10000)
+ self.s = Series(["2Q2005", "2Q05", "2005Q1", "05Q1"] * 10000)
def time_infer_quarter(self):
to_datetime(self.s)
class ToDatetimeFormat:
-
def setup(self):
- self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000)
- self.s2 = self.s.str.replace(':\\S+$', '')
+ self.s = Series(["19MAY11", "19MAY11:00:00:00"] * 100000)
+ self.s2 = self.s.str.replace(":\\S+$", "")
def time_exact(self):
- to_datetime(self.s2, format='%d%b%y')
+ to_datetime(self.s2, format="%d%b%y")
def time_no_exact(self):
- to_datetime(self.s, format='%d%b%y', exact=False)
+ to_datetime(self.s, format="%d%b%y", exact=False)
class ToDatetimeCache:
params = [True, False]
- param_names = ['cache']
+ param_names = ["cache"]
def setup(self, cache):
N = 10000
self.unique_numeric_seconds = list(range(N))
self.dup_numeric_seconds = [1000] * N
- self.dup_string_dates = ['2000-02-11'] * N
- self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N
+ self.dup_string_dates = ["2000-02-11"] * N
+ self.dup_string_with_tz = ["2000-02-11 15:00:00-0800"] * N
def time_unique_seconds_and_unit(self, cache):
- to_datetime(self.unique_numeric_seconds, unit='s', cache=cache)
+ to_datetime(self.unique_numeric_seconds, unit="s", cache=cache)
def time_dup_seconds_and_unit(self, cache):
- to_datetime(self.dup_numeric_seconds, unit='s', cache=cache)
+ to_datetime(self.dup_numeric_seconds, unit="s", cache=cache)
def time_dup_string_dates(self, cache):
to_datetime(self.dup_string_dates, cache=cache)
def time_dup_string_dates_and_format(self, cache):
- to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache)
+ to_datetime(self.dup_string_dates, format="%Y-%m-%d", cache=cache)
def time_dup_string_tzoffset_dates(self, cache):
to_datetime(self.dup_string_with_tz, cache=cache)
@@ -407,14 +397,12 @@ def time_dup_string_tzoffset_dates(self, cache):
class DatetimeAccessor:
- params = [None, 'US/Eastern', 'UTC', dateutil.tz.tzutc()]
- param_names = 'tz'
+ params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()]
+ param_names = "tz"
def setup(self, tz):
N = 100000
- self.series = Series(
- date_range(start='1/1/2000', periods=N, freq='T', tz=tz)
- )
+ self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz))
def time_dt_accessor(self, tz):
self.series.dt
diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py
index c6e56804c7b21..8ebb2d8d2f35d 100644
--- a/asv_bench/benchmarks/timestamp.py
+++ b/asv_bench/benchmarks/timestamp.py
@@ -7,21 +7,20 @@
class TimestampConstruction:
-
def time_parse_iso8601_no_tz(self):
- Timestamp('2017-08-25 08:16:14')
+ Timestamp("2017-08-25 08:16:14")
def time_parse_iso8601_tz(self):
- Timestamp('2017-08-25 08:16:14-0500')
+ Timestamp("2017-08-25 08:16:14-0500")
def time_parse_dateutil(self):
- Timestamp('2017/08/25 08:16:14 AM')
+ Timestamp("2017/08/25 08:16:14 AM")
def time_parse_today(self):
- Timestamp('today')
+ Timestamp("today")
def time_parse_now(self):
- Timestamp('now')
+ Timestamp("now")
def time_fromordinal(self):
Timestamp.fromordinal(730120)
@@ -31,14 +30,13 @@ def time_fromtimestamp(self):
class TimestampProperties:
- _tzs = [None, pytz.timezone('Europe/Amsterdam'), pytz.UTC,
- dateutil.tz.tzutc()]
- _freqs = [None, 'B']
+ _tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()]
+ _freqs = [None, "B"]
params = [_tzs, _freqs]
- param_names = ['tz', 'freq']
+ param_names = ["tz", "freq"]
def setup(self, tz, freq):
- self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq)
+ self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq)
def time_tz(self, tz, freq):
self.ts.tz
@@ -93,15 +91,14 @@ def time_month_name(self, tz, freq):
class TimestampOps:
- params = [None, 'US/Eastern', pytz.UTC,
- dateutil.tz.tzutc()]
- param_names = ['tz']
+ params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()]
+ param_names = ["tz"]
def setup(self, tz):
- self.ts = Timestamp('2017-08-25 08:16:14', tz=tz)
+ self.ts = Timestamp("2017-08-25 08:16:14", tz=tz)
def time_replace_tz(self, tz):
- self.ts.replace(tzinfo=pytz.timezone('US/Eastern'))
+ self.ts.replace(tzinfo=pytz.timezone("US/Eastern"))
def time_replace_None(self, tz):
self.ts.replace(tzinfo=None)
@@ -124,16 +121,16 @@ def time_to_julian_date(self, tz):
self.ts.to_julian_date()
def time_floor(self, tz):
- self.ts.floor('5T')
+ self.ts.floor("5T")
def time_ceil(self, tz):
- self.ts.ceil('5T')
+ self.ts.ceil("5T")
class TimestampAcrossDst:
def setup(self):
dt = datetime.datetime(2016, 3, 27, 1)
- self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo
+ self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo
self.ts2 = Timestamp(dt)
def time_replace_across_dst(self):
diff --git a/ci/print_skipped.py b/ci/print_skipped.py
index 859481c5d188d..a44281044e11d 100755
--- a/ci/print_skipped.py
+++ b/ci/print_skipped.py
@@ -11,45 +11,42 @@ def parse_results(filename):
root = tree.getroot()
skipped = []
- current_class = ''
+ current_class = ""
i = 1
assert i - 1 == len(skipped)
- for el in root.findall('testcase'):
- cn = el.attrib['classname']
- for sk in el.findall('skipped'):
+ for el in root.findall("testcase"):
+ cn = el.attrib["classname"]
+ for sk in el.findall("skipped"):
old_class = current_class
current_class = cn
- name = '{classname}.{name}'.format(classname=current_class,
- name=el.attrib['name'])
- msg = sk.attrib['message']
- out = ''
+ name = "{classname}.{name}".format(
+ classname=current_class, name=el.attrib["name"]
+ )
+ msg = sk.attrib["message"]
+ out = ""
if old_class != current_class:
ndigits = int(math.log(i, 10) + 1)
# 4 for : + space + # + space
- out += ('-' * (len(name + msg) + 4 + ndigits) + '\n')
- out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg)
+ out += "-" * (len(name + msg) + 4 + ndigits) + "\n"
+ out += "#{i} {name}: {msg}".format(i=i, name=name, msg=msg)
skipped.append(out)
i += 1
assert i - 1 == len(skipped)
assert i - 1 == len(skipped)
# assert len(skipped) == int(root.attrib['skip'])
- return '\n'.join(skipped)
+ return "\n".join(skipped)
def main():
- test_files = [
- 'test-data-single.xml',
- 'test-data-multiple.xml',
- 'test-data.xml',
- ]
+ test_files = ["test-data-single.xml", "test-data-multiple.xml", "test-data.xml"]
- print('SKIPPED TESTS:')
+ print("SKIPPED TESTS:")
for fn in test_files:
if os.path.isfile(fn):
print(parse_results(fn))
return 0
-if __name__ == '__main__':
+if __name__ == "__main__":
sys.exit(main())
diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py
index c3647f0c7d2a8..5a07b094e6ad3 100644
--- a/doc/logo/pandas_logo.py
+++ b/doc/logo/pandas_logo.py
@@ -4,7 +4,7 @@
from matplotlib import rcParams
import numpy as np
-rcParams['mathtext.fontset'] = 'cm'
+rcParams["mathtext.fontset"] = "cm"
def fnx():
@@ -37,8 +37,12 @@ def fnx():
plt.figtext(0.05, 0.5, "pandas", size=40)
plt.figtext(
- 0.05, 0.2, r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
- size=16, color="#5a89a4")
-
-fig.savefig('pandas_logo.svg')
-fig.savefig('pandas_logo.png')
+ 0.05,
+ 0.2,
+ r"$y_{it} = \beta^{\prime} x_{it} + \mu_{i} + \epsilon_{it}$",
+ size=16,
+ color="#5a89a4",
+)
+
+fig.savefig("pandas_logo.svg")
+fig.savefig("pandas_logo.png")
diff --git a/doc/make.py b/doc/make.py
index 496b3cfd4ee45..48febef20fbe6 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -24,9 +24,9 @@
DOC_PATH = os.path.dirname(os.path.abspath(__file__))
-SOURCE_PATH = os.path.join(DOC_PATH, 'source')
-BUILD_PATH = os.path.join(DOC_PATH, 'build')
-REDIRECTS_FILE = os.path.join(DOC_PATH, 'redirects.csv')
+SOURCE_PATH = os.path.join(DOC_PATH, "source")
+BUILD_PATH = os.path.join(DOC_PATH, "build")
+REDIRECTS_FILE = os.path.join(DOC_PATH, "redirects.csv")
class DocBuilder:
@@ -36,8 +36,15 @@ class DocBuilder:
All public methods of this class can be called as parameters of the
script.
"""
- def __init__(self, num_jobs=0, include_api=True, single_doc=None,
- verbosity=0, warnings_are_errors=False):
+
+ def __init__(
+ self,
+ num_jobs=0,
+ include_api=True,
+ single_doc=None,
+ verbosity=0,
+ warnings_are_errors=False,
+ ):
self.num_jobs = num_jobs
self.verbosity = verbosity
self.warnings_are_errors = warnings_are_errors
@@ -45,16 +52,15 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None,
if single_doc:
single_doc = self._process_single_doc(single_doc)
include_api = False
- os.environ['SPHINX_PATTERN'] = single_doc
+ os.environ["SPHINX_PATTERN"] = single_doc
elif not include_api:
- os.environ['SPHINX_PATTERN'] = '-api'
+ os.environ["SPHINX_PATTERN"] = "-api"
self.single_doc_html = None
- if single_doc and single_doc.endswith('.rst'):
- self.single_doc_html = os.path.splitext(single_doc)[0] + '.html'
+ if single_doc and single_doc.endswith(".rst"):
+ self.single_doc_html = os.path.splitext(single_doc)[0] + ".html"
elif single_doc:
- self.single_doc_html = 'reference/api/pandas.{}.html'.format(
- single_doc)
+ self.single_doc_html = "reference/api/pandas.{}.html".format(single_doc)
def _process_single_doc(self, single_doc):
"""
@@ -66,26 +72,30 @@ def _process_single_doc(self, single_doc):
(e.g. reference/api/pandas.DataFrame.head.rst).
"""
base_name, extension = os.path.splitext(single_doc)
- if extension in ('.rst', '.ipynb'):
+ if extension in (".rst", ".ipynb"):
if os.path.exists(os.path.join(SOURCE_PATH, single_doc)):
return single_doc
else:
- raise FileNotFoundError('File {} not found'.format(single_doc))
+ raise FileNotFoundError("File {} not found".format(single_doc))
- elif single_doc.startswith('pandas.'):
+ elif single_doc.startswith("pandas."):
try:
obj = pandas # noqa: F821
- for name in single_doc.split('.'):
+ for name in single_doc.split("."):
obj = getattr(obj, name)
except AttributeError:
- raise ImportError('Could not import {}'.format(single_doc))
+ raise ImportError("Could not import {}".format(single_doc))
else:
- return single_doc[len('pandas.'):]
+ return single_doc[len("pandas.") :]
else:
- raise ValueError(('--single={} not understood. Value should be a '
- 'valid path to a .rst or .ipynb file, or a '
- 'valid pandas object (e.g. categorical.rst or '
- 'pandas.DataFrame.head)').format(single_doc))
+ raise ValueError(
+ (
+ "--single={} not understood. Value should be a "
+ "valid path to a .rst or .ipynb file, or a "
+ "valid pandas object (e.g. categorical.rst or "
+ "pandas.DataFrame.head)"
+ ).format(single_doc)
+ )
@staticmethod
def _run_os(*args):
@@ -117,52 +127,55 @@ def _sphinx_build(self, kind):
--------
>>> DocBuilder(num_jobs=4)._sphinx_build('html')
"""
- if kind not in ('html', 'latex'):
- raise ValueError('kind must be html or latex, '
- 'not {}'.format(kind))
+ if kind not in ("html", "latex"):
+ raise ValueError("kind must be html or latex, " "not {}".format(kind))
- cmd = ['sphinx-build', '-b', kind]
+ cmd = ["sphinx-build", "-b", kind]
if self.num_jobs:
- cmd += ['-j', str(self.num_jobs)]
+ cmd += ["-j", str(self.num_jobs)]
if self.warnings_are_errors:
- cmd += ['-W', '--keep-going']
+ cmd += ["-W", "--keep-going"]
if self.verbosity:
- cmd.append('-{}'.format('v' * self.verbosity))
- cmd += ['-d', os.path.join(BUILD_PATH, 'doctrees'),
- SOURCE_PATH, os.path.join(BUILD_PATH, kind)]
+ cmd.append("-{}".format("v" * self.verbosity))
+ cmd += [
+ "-d",
+ os.path.join(BUILD_PATH, "doctrees"),
+ SOURCE_PATH,
+ os.path.join(BUILD_PATH, kind),
+ ]
return subprocess.call(cmd)
def _open_browser(self, single_doc_html):
"""
Open a browser tab showing the single built HTML page
"""
- url = os.path.join('file://', DOC_PATH, 'build', 'html',
- single_doc_html)
+ url = os.path.join("file://", DOC_PATH, "build", "html", single_doc_html)
webbrowser.open(url, new=2)
def _get_page_title(self, page):
"""
Open the rst file `page` and extract its title.
"""
- fname = os.path.join(SOURCE_PATH, '{}.rst'.format(page))
+ fname = os.path.join(SOURCE_PATH, "{}.rst".format(page))
option_parser = docutils.frontend.OptionParser(
- components=(docutils.parsers.rst.Parser,))
- doc = docutils.utils.new_document(
- '',
- option_parser.get_default_values())
+ components=(docutils.parsers.rst.Parser,)
+ )
+ doc = docutils.utils.new_document("", option_parser.get_default_values())
with open(fname) as f:
data = f.read()
parser = docutils.parsers.rst.Parser()
# do not generate any warning when parsing the rst
- with open(os.devnull, 'a') as f:
+ with open(os.devnull, "a") as f:
doc.reporter.stream = f
parser.parse(data, doc)
- section = next(node for node in doc.children
- if isinstance(node, docutils.nodes.section))
- title = next(node for node in section.children
- if isinstance(node, docutils.nodes.title))
+ section = next(
+ node for node in doc.children if isinstance(node, docutils.nodes.section)
+ )
+ title = next(
+ node for node in section.children if isinstance(node, docutils.nodes.title)
+ )
return title.astext()
@@ -171,7 +184,7 @@ def _add_redirects(self):
Create in the build directory an html file with a redirect,
for every row in REDIRECTS_FILE.
"""
- html = '''
+ html = """
@@ -182,16 +195,14 @@ def _add_redirects(self):
- '''
+ """
with open(REDIRECTS_FILE) as mapping_fd:
reader = csv.reader(mapping_fd)
for row in reader:
- if not row or row[0].strip().startswith('#'):
+ if not row or row[0].strip().startswith("#"):
continue
- path = os.path.join(BUILD_PATH,
- 'html',
- *row[0].split('/')) + '.html'
+ path = os.path.join(BUILD_PATH, "html", *row[0].split("/")) + ".html"
try:
title = self._get_page_title(row[1])
@@ -199,24 +210,26 @@ def _add_redirects(self):
# the file can be an ipynb and not an rst, or docutils
# may not be able to read the rst because it has some
# sphinx specific stuff
- title = 'this page'
+ title = "this page"
if os.path.exists(path):
- raise RuntimeError((
- 'Redirection would overwrite an existing file: '
- '{}').format(path))
+ raise RuntimeError(
+ ("Redirection would overwrite an existing file: " "{}").format(
+ path
+ )
+ )
- with open(path, 'w') as moved_page_fd:
+ with open(path, "w") as moved_page_fd:
moved_page_fd.write(
- html.format(url='{}.html'.format(row[1]),
- title=title))
+ html.format(url="{}.html".format(row[1]), title=title)
+ )
def html(self):
"""
Build HTML documentation.
"""
- ret_code = self._sphinx_build('html')
- zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+ ret_code = self._sphinx_build("html")
+ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
if os.path.exists(zip_fname):
os.remove(zip_fname)
@@ -231,20 +244,20 @@ def latex(self, force=False):
"""
Build PDF documentation.
"""
- if sys.platform == 'win32':
- sys.stderr.write('latex build has not been tested on windows\n')
+ if sys.platform == "win32":
+ sys.stderr.write("latex build has not been tested on windows\n")
else:
- ret_code = self._sphinx_build('latex')
- os.chdir(os.path.join(BUILD_PATH, 'latex'))
+ ret_code = self._sphinx_build("latex")
+ os.chdir(os.path.join(BUILD_PATH, "latex"))
if force:
for i in range(3):
- self._run_os('pdflatex',
- '-interaction=nonstopmode',
- 'pandas.tex')
- raise SystemExit('You should check the file '
- '"build/latex/pandas.pdf" for problems.')
+ self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex")
+ raise SystemExit(
+ "You should check the file "
+ '"build/latex/pandas.pdf" for problems.'
+ )
else:
- self._run_os('make')
+ self._run_os("make")
return ret_code
def latex_forced(self):
@@ -259,84 +272,101 @@ def clean():
Clean documentation generated files.
"""
shutil.rmtree(BUILD_PATH, ignore_errors=True)
- shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'),
- ignore_errors=True)
+ shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True)
def zip_html(self):
"""
Compress HTML documentation into a zip file.
"""
- zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+ zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
if os.path.exists(zip_fname):
os.remove(zip_fname)
- dirname = os.path.join(BUILD_PATH, 'html')
+ dirname = os.path.join(BUILD_PATH, "html")
fnames = os.listdir(dirname)
os.chdir(dirname)
- self._run_os('zip',
- zip_fname,
- '-r',
- '-q',
- *fnames)
+ self._run_os("zip", zip_fname, "-r", "-q", *fnames)
def main():
- cmds = [method for method in dir(DocBuilder) if not method.startswith('_')]
+ cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]
argparser = argparse.ArgumentParser(
- description='pandas documentation builder',
- epilog='Commands: {}'.format(','.join(cmds)))
- argparser.add_argument('command',
- nargs='?',
- default='html',
- help='command to run: {}'.format(', '.join(cmds)))
- argparser.add_argument('--num-jobs',
- type=int,
- default=0,
- help='number of jobs used by sphinx-build')
- argparser.add_argument('--no-api',
- default=False,
- help='omit api and autosummary',
- action='store_true')
- argparser.add_argument('--single',
- metavar='FILENAME',
- type=str,
- default=None,
- help=('filename (relative to the "source" folder)'
- ' of section or method name to compile, e.g. '
- '"development/contributing.rst",'
- ' "ecosystem.rst", "pandas.DataFrame.join"'))
- argparser.add_argument('--python-path',
- type=str,
- default=os.path.dirname(DOC_PATH),
- help='path')
- argparser.add_argument('-v', action='count', dest='verbosity', default=0,
- help=('increase verbosity (can be repeated), '
- 'passed to the sphinx build command'))
- argparser.add_argument('--warnings-are-errors', '-W',
- action='store_true',
- help='fail if warnings are raised')
+ description="pandas documentation builder",
+ epilog="Commands: {}".format(",".join(cmds)),
+ )
+ argparser.add_argument(
+ "command",
+ nargs="?",
+ default="html",
+ help="command to run: {}".format(", ".join(cmds)),
+ )
+ argparser.add_argument(
+ "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build"
+ )
+ argparser.add_argument(
+ "--no-api", default=False, help="omit api and autosummary", action="store_true"
+ )
+ argparser.add_argument(
+ "--single",
+ metavar="FILENAME",
+ type=str,
+ default=None,
+ help=(
+ 'filename (relative to the "source" folder)'
+ " of section or method name to compile, e.g. "
+ '"development/contributing.rst",'
+ ' "ecosystem.rst", "pandas.DataFrame.join"'
+ ),
+ )
+ argparser.add_argument(
+ "--python-path", type=str, default=os.path.dirname(DOC_PATH), help="path"
+ )
+ argparser.add_argument(
+ "-v",
+ action="count",
+ dest="verbosity",
+ default=0,
+ help=(
+ "increase verbosity (can be repeated), "
+ "passed to the sphinx build command"
+ ),
+ )
+ argparser.add_argument(
+ "--warnings-are-errors",
+ "-W",
+ action="store_true",
+ help="fail if warnings are raised",
+ )
args = argparser.parse_args()
if args.command not in cmds:
- raise ValueError('Unknown command {}. Available options: {}'.format(
- args.command, ', '.join(cmds)))
+ raise ValueError(
+ "Unknown command {}. Available options: {}".format(
+ args.command, ", ".join(cmds)
+ )
+ )
# Below we update both os.environ and sys.path. The former is used by
# external libraries (namely Sphinx) to compile this module and resolve
# the import of `python_path` correctly. The latter is used to resolve
# the import within the module, injecting it into the global namespace
- os.environ['PYTHONPATH'] = args.python_path
+ os.environ["PYTHONPATH"] = args.python_path
sys.path.insert(0, args.python_path)
- globals()['pandas'] = importlib.import_module('pandas')
+ globals()["pandas"] = importlib.import_module("pandas")
# Set the matplotlib backend to the non-interactive Agg backend for all
# child processes.
- os.environ['MPLBACKEND'] = 'module://matplotlib.backends.backend_agg'
-
- builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
- args.verbosity, args.warnings_are_errors)
+ os.environ["MPLBACKEND"] = "module://matplotlib.backends.backend_agg"
+
+ builder = DocBuilder(
+ args.num_jobs,
+ not args.no_api,
+ args.single,
+ args.verbosity,
+ args.warnings_are_errors,
+ )
return getattr(builder, args.command)()
-if __name__ == '__main__':
+if __name__ == "__main__":
sys.exit(main())
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 2484a9d592e09..3ebc5d8b6333b 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -34,15 +34,13 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.append(os.path.abspath('.'))
-sys.path.insert(0, os.path.abspath('../sphinxext'))
-sys.path.extend([
-
- # numpy standard doc extensions
- os.path.join(os.path.dirname(__file__),
- '..', '../..',
- 'sphinxext')
-
-])
+sys.path.insert(0, os.path.abspath("../sphinxext"))
+sys.path.extend(
+ [
+ # numpy standard doc extensions
+ os.path.join(os.path.dirname(__file__), "..", "../..", "sphinxext")
+ ]
+)
# -- General configuration -----------------------------------------------
@@ -50,65 +48,66 @@
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
# sphinxext.
-extensions = ['sphinx.ext.autodoc',
- 'sphinx.ext.autosummary',
- 'sphinx.ext.doctest',
- 'sphinx.ext.extlinks',
- 'sphinx.ext.todo',
- 'numpydoc', # handle NumPy documentation formatted docstrings
- 'IPython.sphinxext.ipython_directive',
- 'IPython.sphinxext.ipython_console_highlighting',
- 'matplotlib.sphinxext.plot_directive',
- 'sphinx.ext.intersphinx',
- 'sphinx.ext.coverage',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.linkcode',
- 'nbsphinx',
- 'contributors', # custom pandas extension
- ]
-
-exclude_patterns = ['**.ipynb_checkpoints']
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.doctest",
+ "sphinx.ext.extlinks",
+ "sphinx.ext.todo",
+ "numpydoc", # handle NumPy documentation formatted docstrings
+ "IPython.sphinxext.ipython_directive",
+ "IPython.sphinxext.ipython_console_highlighting",
+ "matplotlib.sphinxext.plot_directive",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.coverage",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.linkcode",
+ "nbsphinx",
+ "contributors", # custom pandas extension
+]
+
+exclude_patterns = ["**.ipynb_checkpoints"]
try:
import nbconvert
except ImportError:
- logger.warn('nbconvert not installed. Skipping notebooks.')
- exclude_patterns.append('**/*.ipynb')
+ logger.warn("nbconvert not installed. Skipping notebooks.")
+ exclude_patterns.append("**/*.ipynb")
else:
try:
nbconvert.utils.pandoc.get_pandoc_version()
except nbconvert.utils.pandoc.PandocMissing:
- logger.warn('Pandoc not installed. Skipping notebooks.')
- exclude_patterns.append('**/*.ipynb')
+ logger.warn("Pandoc not installed. Skipping notebooks.")
+ exclude_patterns.append("**/*.ipynb")
# sphinx_pattern can be '-api' to exclude the API pages,
# the path to a file, or a Python object
# (e.g. '10min.rst' or 'pandas.DataFrame.head')
source_path = os.path.dirname(os.path.abspath(__file__))
-pattern = os.environ.get('SPHINX_PATTERN')
+pattern = os.environ.get("SPHINX_PATTERN")
if pattern:
for dirname, dirs, fnames in os.walk(source_path):
for fname in fnames:
- if os.path.splitext(fname)[-1] in ('.rst', '.ipynb'):
- fname = os.path.relpath(os.path.join(dirname, fname),
- source_path)
+ if os.path.splitext(fname)[-1] in (".rst", ".ipynb"):
+ fname = os.path.relpath(os.path.join(dirname, fname), source_path)
- if (fname == 'index.rst'
- and os.path.abspath(dirname) == source_path):
+ if fname == "index.rst" and os.path.abspath(dirname) == source_path:
continue
- elif pattern == '-api' and dirname == 'reference':
+ elif pattern == "-api" and dirname == "reference":
exclude_patterns.append(fname)
- elif pattern != '-api' and fname != pattern:
+ elif pattern != "-api" and fname != pattern:
exclude_patterns.append(fname)
-with open(os.path.join(source_path, 'index.rst.template')) as f:
+with open(os.path.join(source_path, "index.rst.template")) as f:
t = jinja2.Template(f.read())
-with open(os.path.join(source_path, 'index.rst'), 'w') as f:
- f.write(t.render(include_api=pattern is None,
- single_doc=(pattern
- if pattern is not None and pattern != '-api'
- else None)))
-autosummary_generate = True if pattern is None else ['index']
+with open(os.path.join(source_path, "index.rst"), "w") as f:
+ f.write(
+ t.render(
+ include_api=pattern is None,
+ single_doc=(pattern if pattern is not None and pattern != "-api" else None),
+ )
+ )
+autosummary_generate = True if pattern is None else ["index"]
# numpydoc
numpydoc_attributes_as_param_list = False
@@ -122,22 +121,20 @@
import pandas as pd"""
# Add any paths that contain templates here, relative to this directory.
-templates_path = ['../_templates']
+templates_path = ["../_templates"]
# The suffix of source filenames.
-source_suffix = [
- '.rst',
-]
+source_suffix = [".rst"]
# The encoding of source files.
-source_encoding = 'utf-8'
+source_encoding = "utf-8"
# The master toctree document.
-master_doc = 'index'
+master_doc = "index"
# General information about the project.
-project = 'pandas'
-copyright = '2008-2014, the pandas development team'
+project = "pandas"
+copyright = "2008-2014, the pandas development team"
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -184,7 +181,7 @@
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
@@ -194,7 +191,7 @@
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'nature_with_gtoc'
+html_theme = "nature_with_gtoc"
# The style sheet to use for HTML and HTML Help pages. A file of that name
# must exist either in Sphinx' static/ path, or in one of the custom paths
@@ -207,7 +204,7 @@
# html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = ['themes']
+html_theme_path = ["themes"]
# The name for this set of Sphinx documents. If None, it defaults to
# " v documentation".
@@ -223,12 +220,12 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
-html_favicon = os.path.join(html_static_path[0], 'favicon.ico')
+html_favicon = os.path.join(html_static_path[0], "favicon.ico")
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
@@ -250,60 +247,62 @@
# https://github.com/pandas-dev/pandas/issues/16186
moved_api_pages = [
- ('pandas.core.common.isnull', 'pandas.isna'),
- ('pandas.core.common.notnull', 'pandas.notna'),
- ('pandas.core.reshape.get_dummies', 'pandas.get_dummies'),
- ('pandas.tools.merge.concat', 'pandas.concat'),
- ('pandas.tools.merge.merge', 'pandas.merge'),
- ('pandas.tools.pivot.pivot_table', 'pandas.pivot_table'),
- ('pandas.tseries.tools.to_datetime', 'pandas.to_datetime'),
- ('pandas.io.clipboard.read_clipboard', 'pandas.read_clipboard'),
- ('pandas.io.excel.ExcelFile.parse', 'pandas.ExcelFile.parse'),
- ('pandas.io.excel.read_excel', 'pandas.read_excel'),
- ('pandas.io.gbq.read_gbq', 'pandas.read_gbq'),
- ('pandas.io.html.read_html', 'pandas.read_html'),
- ('pandas.io.json.read_json', 'pandas.read_json'),
- ('pandas.io.parsers.read_csv', 'pandas.read_csv'),
- ('pandas.io.parsers.read_fwf', 'pandas.read_fwf'),
- ('pandas.io.parsers.read_table', 'pandas.read_table'),
- ('pandas.io.pickle.read_pickle', 'pandas.read_pickle'),
- ('pandas.io.pytables.HDFStore.append', 'pandas.HDFStore.append'),
- ('pandas.io.pytables.HDFStore.get', 'pandas.HDFStore.get'),
- ('pandas.io.pytables.HDFStore.put', 'pandas.HDFStore.put'),
- ('pandas.io.pytables.HDFStore.select', 'pandas.HDFStore.select'),
- ('pandas.io.pytables.read_hdf', 'pandas.read_hdf'),
- ('pandas.io.sql.read_sql', 'pandas.read_sql'),
- ('pandas.io.sql.read_frame', 'pandas.read_frame'),
- ('pandas.io.sql.write_frame', 'pandas.write_frame'),
- ('pandas.io.stata.read_stata', 'pandas.read_stata'),
+ ("pandas.core.common.isnull", "pandas.isna"),
+ ("pandas.core.common.notnull", "pandas.notna"),
+ ("pandas.core.reshape.get_dummies", "pandas.get_dummies"),
+ ("pandas.tools.merge.concat", "pandas.concat"),
+ ("pandas.tools.merge.merge", "pandas.merge"),
+ ("pandas.tools.pivot.pivot_table", "pandas.pivot_table"),
+ ("pandas.tseries.tools.to_datetime", "pandas.to_datetime"),
+ ("pandas.io.clipboard.read_clipboard", "pandas.read_clipboard"),
+ ("pandas.io.excel.ExcelFile.parse", "pandas.ExcelFile.parse"),
+ ("pandas.io.excel.read_excel", "pandas.read_excel"),
+ ("pandas.io.gbq.read_gbq", "pandas.read_gbq"),
+ ("pandas.io.html.read_html", "pandas.read_html"),
+ ("pandas.io.json.read_json", "pandas.read_json"),
+ ("pandas.io.parsers.read_csv", "pandas.read_csv"),
+ ("pandas.io.parsers.read_fwf", "pandas.read_fwf"),
+ ("pandas.io.parsers.read_table", "pandas.read_table"),
+ ("pandas.io.pickle.read_pickle", "pandas.read_pickle"),
+ ("pandas.io.pytables.HDFStore.append", "pandas.HDFStore.append"),
+ ("pandas.io.pytables.HDFStore.get", "pandas.HDFStore.get"),
+ ("pandas.io.pytables.HDFStore.put", "pandas.HDFStore.put"),
+ ("pandas.io.pytables.HDFStore.select", "pandas.HDFStore.select"),
+ ("pandas.io.pytables.read_hdf", "pandas.read_hdf"),
+ ("pandas.io.sql.read_sql", "pandas.read_sql"),
+ ("pandas.io.sql.read_frame", "pandas.read_frame"),
+ ("pandas.io.sql.write_frame", "pandas.write_frame"),
+ ("pandas.io.stata.read_stata", "pandas.read_stata"),
]
# Again, tuples of (from_old, to_new)
moved_classes = [
- ('pandas.tseries.resample.Resampler', 'pandas.core.resample.Resampler'),
- ('pandas.formats.style.Styler', 'pandas.io.formats.style.Styler'),
+ ("pandas.tseries.resample.Resampler", "pandas.core.resample.Resampler"),
+ ("pandas.formats.style.Styler", "pandas.io.formats.style.Styler"),
]
for old, new in moved_classes:
# the class itself...
moved_api_pages.append((old, new))
- mod, classname = new.rsplit('.', 1)
+ mod, classname = new.rsplit(".", 1)
klass = getattr(importlib.import_module(mod), classname)
- methods = [x for x in dir(klass)
- if not x.startswith('_') or x in ('__iter__', '__array__')]
+ methods = [
+ x for x in dir(klass) if not x.startswith("_") or x in ("__iter__", "__array__")
+ ]
for method in methods:
# ... and each of its public methods
moved_api_pages.append(
- ("{old}.{method}".format(old=old, method=method),
- "{new}.{method}".format(new=new, method=method))
+ (
+ "{old}.{method}".format(old=old, method=method),
+ "{new}.{method}".format(new=new, method=method),
+ )
)
if pattern is None:
html_additional_pages = {
- 'generated/' + page[0]: 'api_redirect.html'
- for page in moved_api_pages
+ "generated/" + page[0]: "api_redirect.html" for page in moved_api_pages
}
@@ -323,12 +322,14 @@
import os
os.chdir(r'{}')
-""".format(os.path.dirname(os.path.dirname(__file__)))
+""".format(
+ os.path.dirname(os.path.dirname(__file__))
+)
html_context = {
- 'redirects': {old: new for old, new in moved_api_pages},
- 'header': header
+ "redirects": {old: new for old, new in moved_api_pages},
+ "header": header,
}
# If false, no module index is generated.
@@ -352,7 +353,7 @@
# html_file_suffix = ''
# Output file base name for HTML help builder.
-htmlhelp_basename = 'pandas'
+htmlhelp_basename = "pandas"
# -- Options for nbsphinx ------------------------------------------------
@@ -371,9 +372,13 @@
# Grouping the document tree into LaTeX files. List of tuples (source start
# file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'pandas.tex',
- 'pandas: powerful Python data analysis toolkit',
- r'Wes McKinney\n\& PyData Development Team', 'manual'),
+ (
+ "index",
+ "pandas.tex",
+ "pandas: powerful Python data analysis toolkit",
+ r"Wes McKinney\n\& PyData Development Team",
+ "manual",
+ )
]
# The name of an image file (relative to this directory) to place at the top of
@@ -396,32 +401,32 @@
if pattern is None:
intersphinx_mapping = {
- 'dateutil': ("https://dateutil.readthedocs.io/en/latest/", None),
- 'matplotlib': ('https://matplotlib.org/', None),
- 'numpy': ('https://docs.scipy.org/doc/numpy/', None),
- 'pandas-gbq': ('https://pandas-gbq.readthedocs.io/en/latest/', None),
- 'py': ('https://pylib.readthedocs.io/en/latest/', None),
- 'python': ('https://docs.python.org/3/', None),
- 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None),
- 'statsmodels': ('http://www.statsmodels.org/devel/', None),
+ "dateutil": ("https://dateutil.readthedocs.io/en/latest/", None),
+ "matplotlib": ("https://matplotlib.org/", None),
+ "numpy": ("https://docs.scipy.org/doc/numpy/", None),
+ "pandas-gbq": ("https://pandas-gbq.readthedocs.io/en/latest/", None),
+ "py": ("https://pylib.readthedocs.io/en/latest/", None),
+ "python": ("https://docs.python.org/3/", None),
+ "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
+ "statsmodels": ("http://www.statsmodels.org/devel/", None),
}
# extlinks alias
-extlinks = {'issue': ('https://github.com/pandas-dev/pandas/issues/%s',
- 'GH'),
- 'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s',
- 'wiki ')}
+extlinks = {
+ "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"),
+ "wiki": ("https://github.com/pandas-dev/pandas/wiki/%s", "wiki "),
+}
ipython_warning_is_error = False
ipython_exec_lines = [
- 'import numpy as np',
- 'import pandas as pd',
+ "import numpy as np",
+ "import pandas as pd",
    # This ensures correct rendering on systems with console encoding != utf8
# (windows). It forces pandas to encode its output reprs using utf8
# wherever the docs are built. The docs' target is the browser, not
# the console, so this is fine.
- 'pd.options.display.encoding="utf8"'
+ 'pd.options.display.encoding="utf8"',
]
@@ -430,8 +435,7 @@
import sphinx
from sphinx.util import rpartition
-from sphinx.ext.autodoc import (
- Documenter, MethodDocumenter, AttributeDocumenter)
+from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter
from sphinx.ext.autosummary import Autosummary
@@ -439,8 +443,9 @@ class AccessorDocumenter(MethodDocumenter):
"""
Specialized Documenter subclass for accessors.
"""
- objtype = 'accessor'
- directivetype = 'method'
+
+ objtype = "accessor"
+ directivetype = "method"
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
@@ -448,7 +453,7 @@ class AccessorDocumenter(MethodDocumenter):
def format_signature(self):
# this method gives an error/warning for the accessors, therefore
# overriding it (accessor has no arguments)
- return ''
+ return ""
class AccessorLevelDocumenter(Documenter):
@@ -456,6 +461,7 @@ class AccessorLevelDocumenter(Documenter):
Specialized Documenter subclass for objects on accessor level (methods,
attributes).
"""
+
# This is the simple straightforward version
# modname is None, base the last elements (eg 'hour')
# and path the part before (eg 'Series.dt')
@@ -468,41 +474,40 @@ class AccessorLevelDocumenter(Documenter):
def resolve_name(self, modname, parents, path, base):
if modname is None:
if path:
- mod_cls = path.rstrip('.')
+ mod_cls = path.rstrip(".")
else:
mod_cls = None
# if documenting a class-level object without path,
# there must be a current class, either from a parent
# auto directive ...
- mod_cls = self.env.temp_data.get('autodoc:class')
+ mod_cls = self.env.temp_data.get("autodoc:class")
# ... or from a class directive
if mod_cls is None:
- mod_cls = self.env.temp_data.get('py:class')
+ mod_cls = self.env.temp_data.get("py:class")
# ... if still None, there's no way to know
if mod_cls is None:
return None, []
# HACK: this is added in comparison to ClassLevelDocumenter
            # mod_cls still consists of class.accessor, so an extra
# rpartition is needed
- modname, accessor = rpartition(mod_cls, '.')
- modname, cls = rpartition(modname, '.')
+ modname, accessor = rpartition(mod_cls, ".")
+ modname, cls = rpartition(modname, ".")
parents = [cls, accessor]
# if the module name is still missing, get it like above
if not modname:
- modname = self.env.temp_data.get('autodoc:module')
+ modname = self.env.temp_data.get("autodoc:module")
if not modname:
- if sphinx.__version__ > '1.3':
- modname = self.env.ref_context.get('py:module')
+ if sphinx.__version__ > "1.3":
+ modname = self.env.ref_context.get("py:module")
else:
- modname = self.env.temp_data.get('py:module')
+ modname = self.env.temp_data.get("py:module")
# ... else, it stays None, which means invalid
return modname, parents + [base]
-class AccessorAttributeDocumenter(AccessorLevelDocumenter,
- AttributeDocumenter):
- objtype = 'accessorattribute'
- directivetype = 'attribute'
+class AccessorAttributeDocumenter(AccessorLevelDocumenter, AttributeDocumenter):
+ objtype = "accessorattribute"
+ directivetype = "attribute"
# lower than AttributeDocumenter so this is not chosen for normal
# attributes
@@ -510,8 +515,8 @@ class AccessorAttributeDocumenter(AccessorLevelDocumenter,
class AccessorMethodDocumenter(AccessorLevelDocumenter, MethodDocumenter):
- objtype = 'accessormethod'
- directivetype = 'method'
+ objtype = "accessormethod"
+ directivetype = "method"
# lower than MethodDocumenter so this is not chosen for normal methods
priority = 0.6
@@ -522,14 +527,15 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter):
This documenter lets us remove .__call__ from the method signature for
callable accessors like Series.plot
"""
- objtype = 'accessorcallable'
- directivetype = 'method'
+
+ objtype = "accessorcallable"
+ directivetype = "method"
# lower than MethodDocumenter; otherwise the doc build prints warnings
priority = 0.5
def format_name(self):
- return MethodDocumenter.format_name(self).rstrip('.__call__')
+ return MethodDocumenter.format_name(self).rstrip(".__call__")
class PandasAutosummary(Autosummary):
@@ -537,15 +543,16 @@ class PandasAutosummary(Autosummary):
This alternative autosummary class lets us override the table summary for
Series.plot and DataFrame.plot in the API docs.
"""
+
def _replace_pandas_items(self, display_name, sig, summary, real_name):
# this is a hack: ideally we should extract the signature from the
# .__call__ method instead of hard coding this
- if display_name == 'DataFrame.plot':
- sig = '([x, y, kind, ax, ....])'
- summary = 'DataFrame plotting accessor and method'
- elif display_name == 'Series.plot':
- sig = '([kind, ax, figsize, ....])'
- summary = 'Series plotting accessor and method'
+ if display_name == "DataFrame.plot":
+ sig = "([x, y, kind, ax, ....])"
+ summary = "DataFrame plotting accessor and method"
+ elif display_name == "Series.plot":
+ sig = "([kind, ax, figsize, ....])"
+ summary = "Series plotting accessor and method"
return (display_name, sig, summary, real_name)
@staticmethod
@@ -554,15 +561,15 @@ def _is_deprecated(real_name):
obj, parent, modname = _import_by_name(real_name)
except ImportError:
return False
- doc = NumpyDocString(obj.__doc__ or '')
- summary = ''.join(doc['Summary'] + doc['Extended Summary'])
- return '.. deprecated::' in summary
+ doc = NumpyDocString(obj.__doc__ or "")
+ summary = "".join(doc["Summary"] + doc["Extended Summary"])
+ return ".. deprecated::" in summary
def _add_deprecation_prefixes(self, items):
for item in items:
display_name, sig, summary, real_name = item
if self._is_deprecated(real_name):
- summary = '(DEPRECATED) %s' % summary
+ summary = "(DEPRECATED) %s" % summary
yield display_name, sig, summary, real_name
def get_items(self, names):
@@ -577,18 +584,18 @@ def linkcode_resolve(domain, info):
"""
Determine the URL corresponding to Python object
"""
- if domain != 'py':
+ if domain != "py":
return None
- modname = info['module']
- fullname = info['fullname']
+ modname = info["module"]
+ fullname = info["fullname"]
submod = sys.modules.get(modname)
if submod is None:
return None
obj = submod
- for part in fullname.split('.'):
+ for part in fullname.split("."):
try:
obj = getattr(obj, part)
except AttributeError:
@@ -617,12 +624,14 @@ def linkcode_resolve(domain, info):
fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__))
- if '+' in pandas.__version__:
- return ("http://github.com/pandas-dev/pandas/blob/master/pandas/"
- "{}{}".format(fn, linespec))
+ if "+" in pandas.__version__:
+ return "http://github.com/pandas-dev/pandas/blob/master/pandas/" "{}{}".format(
+ fn, linespec
+ )
else:
- return ("http://github.com/pandas-dev/pandas/blob/"
- "v{}/pandas/{}{}".format(pandas.__version__, fn, linespec))
+ return "http://github.com/pandas-dev/pandas/blob/" "v{}/pandas/{}{}".format(
+ pandas.__version__, fn, linespec
+ )
# remove the docstring of the flags attribute (inherited from numpy ndarray)
@@ -646,7 +655,7 @@ def process_class_docstrings(app, what, name, obj, options, lines):
"""
if what == "class":
- joined = '\n'.join(lines)
+ joined = "\n".join(lines)
templates = [
""".. rubric:: Attributes
@@ -662,25 +671,25 @@ def process_class_docstrings(app, what, name, obj, options, lines):
:toctree:
None
-"""
+""",
]
for template in templates:
if template in joined:
- joined = joined.replace(template, '')
- lines[:] = joined.split('\n')
+ joined = joined.replace(template, "")
+ lines[:] = joined.split("\n")
suppress_warnings = [
# We "overwrite" autosummary with our PandasAutosummary, but
# still want the regular autosummary setup to run. So we just
# suppress this warning.
- 'app.add_directive'
+ "app.add_directive"
]
if pattern:
# When building a single document we don't want to warn because references
# to other documents are unknown, as it's expected
- suppress_warnings.append('ref.ref')
+ suppress_warnings.append("ref.ref")
def rstjinja(app, docname, source):
@@ -689,12 +698,10 @@ def rstjinja(app, docname, source):
"""
# http://ericholscher.com/blog/2016/jul/25/integrating-jinja-rst-sphinx/
# Make sure we're outputting HTML
- if app.builder.format != 'html':
+ if app.builder.format != "html":
return
src = source[0]
- rendered = app.builder.templates.render_string(
- src, app.config.html_context
- )
+ rendered = app.builder.templates.render_string(src, app.config.html_context)
source[0] = rendered
@@ -706,4 +713,4 @@ def setup(app):
app.add_autodocumenter(AccessorAttributeDocumenter)
app.add_autodocumenter(AccessorMethodDocumenter)
app.add_autodocumenter(AccessorCallableDocumenter)
- app.add_directive('autosummary', PandasAutosummary)
+ app.add_directive("autosummary", PandasAutosummary)
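
The conf.py hunks above are typical of the whole patch: black normalizes string literals to double quotes, explodes call arguments and collection items one per line with a trailing comma when a statement no longer fits on one line, and inserts a blank line after class docstrings. A minimal sketch of reproducing the same formatting programmatically; the black.format_str / black.FileMode API and the default 88-character line length are assumptions about the black release in use, not taken from this patch:

    import black  # assumed to be installed; API of current black releases

    src = (
        "extlinks = {'issue': ('https://github.com/pandas-dev/pandas/issues/%s', 'GH'), "
        "'wiki': ('https://github.com/pandas-dev/pandas/wiki/%s', 'wiki ')}\n"
    )
    # Should print essentially the form the patch produces for this statement:
    # double quotes, one key per line, and a trailing comma, because the
    # one-line form exceeds the default 88-character limit.
    print(black.format_str(src, mode=black.FileMode()))
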
diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py
index 950e3592abf6e..1a5ab99b5a94f 100755
--- a/doc/sphinxext/announce.py
+++ b/doc/sphinxext/announce.py
@@ -40,7 +40,7 @@
from git import Repo
-UTF8Writer = codecs.getwriter('utf8')
+UTF8Writer = codecs.getwriter("utf8")
this_repo = Repo(os.path.join(os.path.dirname(__file__), "..", ".."))
author_msg = """\
@@ -54,21 +54,19 @@
def get_authors(revision_range):
- pat = '^.*\\t(.*)$'
- lst_release, cur_release = [r.strip() for r in revision_range.split('..')]
+ pat = "^.*\\t(.*)$"
+ lst_release, cur_release = [r.strip() for r in revision_range.split("..")]
# authors, in current release and previous to current release.
- cur = set(re.findall(pat, this_repo.git.shortlog('-s', revision_range),
- re.M))
- pre = set(re.findall(pat, this_repo.git.shortlog('-s', lst_release),
- re.M))
+ cur = set(re.findall(pat, this_repo.git.shortlog("-s", revision_range), re.M))
+ pre = set(re.findall(pat, this_repo.git.shortlog("-s", lst_release), re.M))
# Homu is the author of auto merges, clean him out.
- cur.discard('Homu')
- pre.discard('Homu')
+ cur.discard("Homu")
+ pre.discard("Homu")
# Append '+' to new authors.
- authors = [s + ' +' for s in cur - pre] + [s for s in cur & pre]
+ authors = [s + " +" for s in cur - pre] + [s for s in cur & pre]
authors.sort()
return authors
@@ -77,19 +75,19 @@ def get_pull_requests(repo, revision_range):
prnums = []
# From regular merges
- merges = this_repo.git.log(
- '--oneline', '--merges', revision_range)
+ merges = this_repo.git.log("--oneline", "--merges", revision_range)
issues = re.findall("Merge pull request \\#(\\d*)", merges)
prnums.extend(int(s) for s in issues)
# From Homu merges (Auto merges)
- issues = re. findall("Auto merge of \\#(\\d*)", merges)
+ issues = re.findall("Auto merge of \\#(\\d*)", merges)
prnums.extend(int(s) for s in issues)
# From fast forward squash-merges
commits = this_repo.git.log(
- '--oneline', '--no-merges', '--first-parent', revision_range)
- issues = re.findall('^.*\\(\\#(\\d+)\\)$', commits, re.M)
+ "--oneline", "--no-merges", "--first-parent", revision_range
+ )
+ issues = re.findall("^.*\\(\\#(\\d+)\\)$", commits, re.M)
prnums.extend(int(s) for s in issues)
# get PR data from github repo
@@ -99,27 +97,29 @@ def get_pull_requests(repo, revision_range):
def build_components(revision_range, heading="Contributors"):
- lst_release, cur_release = [r.strip() for r in revision_range.split('..')]
+ lst_release, cur_release = [r.strip() for r in revision_range.split("..")]
authors = get_authors(revision_range)
return {
- 'heading': heading,
- 'author_message': author_msg % len(authors),
- 'authors': authors,
+ "heading": heading,
+ "author_message": author_msg % len(authors),
+ "authors": authors,
}
def build_string(revision_range, heading="Contributors"):
components = build_components(revision_range, heading=heading)
- components['uline'] = '=' * len(components['heading'])
- components['authors'] = "* " + "\n* ".join(components['authors'])
+ components["uline"] = "=" * len(components["heading"])
+ components["authors"] = "* " + "\n* ".join(components["authors"])
- tpl = textwrap.dedent("""\
+ tpl = textwrap.dedent(
+ """\
{heading}
{uline}
{author_message}
- {authors}""").format(**components)
+ {authors}"""
+ ).format(**components)
return tpl
@@ -133,6 +133,6 @@ def main(revision_range):
from argparse import ArgumentParser
parser = ArgumentParser(description="Generate author lists for release")
- parser.add_argument('revision_range', help='..')
+ parser.add_argument("revision_range", help="..")
args = parser.parse_args()
main(args.revision_range)
diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py
index 7794a24dad89b..4256e4659715d 100644
--- a/doc/sphinxext/contributors.py
+++ b/doc/sphinxext/contributors.py
@@ -17,40 +17,36 @@
class ContributorsDirective(Directive):
required_arguments = 1
- name = 'contributors'
+ name = "contributors"
def run(self):
range_ = self.arguments[0]
- if range_.endswith('x..HEAD'):
+ if range_.endswith("x..HEAD"):
return [nodes.paragraph(), nodes.bullet_list()]
try:
components = build_components(range_)
except git.GitCommandError as exc:
return [
self.state.document.reporter.warning(
- "Cannot find contributors for range '{}': {}".format(
- range_, exc),
- line=self.lineno)
+ "Cannot find contributors for range '{}': {}".format(range_, exc),
+ line=self.lineno,
+ )
]
else:
message = nodes.paragraph()
- message += nodes.Text(components['author_message'])
+ message += nodes.Text(components["author_message"])
listnode = nodes.bullet_list()
- for author in components['authors']:
+ for author in components["authors"]:
para = nodes.paragraph()
para += nodes.Text(author)
- listnode += nodes.list_item('', para)
+ listnode += nodes.list_item("", para)
return [message, listnode]
def setup(app):
- app.add_directive('contributors', ContributorsDirective)
+ app.add_directive("contributors", ContributorsDirective)
- return {
- 'version': '0.1',
- 'parallel_read_safe': True,
- 'parallel_write_safe': True,
- }
+ return {"version": "0.1", "parallel_read_safe": True, "parallel_write_safe": True}
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 5b39d954c2bc3..6351b508fb0e5 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -1,6 +1,6 @@
# flake8: noqa
-__docformat__ = 'restructuredtext'
+__docformat__ = "restructuredtext"
# Let users know if they're missing any of our hard dependencies
hard_dependencies = ("numpy", "pytz", "dateutil")
@@ -13,62 +13,113 @@
missing_dependencies.append("{0}: {1}".format(dependency, str(e)))
if missing_dependencies:
- raise ImportError("Unable to import required dependencies:\n" + "\n".join(missing_dependencies))
+ raise ImportError(
+ "Unable to import required dependencies:\n" + "\n".join(missing_dependencies)
+ )
del hard_dependencies, dependency, missing_dependencies
# numpy compat
from pandas.compat.numpy import (
- _np_version_under1p14, _np_version_under1p15, _np_version_under1p16,
- _np_version_under1p17)
+ _np_version_under1p14,
+ _np_version_under1p15,
+ _np_version_under1p16,
+ _np_version_under1p17,
+)
try:
- from pandas._libs import (hashtable as _hashtable,
- lib as _lib,
- tslib as _tslib)
+ from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib
except ImportError as e: # pragma: no cover
# a hack, but using re here would be overkill
- module = str(e).replace('cannot import name ', '')
- raise ImportError("C extension: {0} not built. If you want to import "
- "pandas from the source directory, you may need to run "
- "'python setup.py build_ext --inplace --force' to build "
- "the C extensions first.".format(module))
+ module = str(e).replace("cannot import name ", "")
+ raise ImportError(
+ "C extension: {0} not built. If you want to import "
+ "pandas from the source directory, you may need to run "
+ "'python setup.py build_ext --inplace --force' to build "
+ "the C extensions first.".format(module)
+ )
from datetime import datetime
-from pandas._config import (get_option, set_option, reset_option,
- describe_option, option_context, options)
+from pandas._config import (
+ get_option,
+ set_option,
+ reset_option,
+ describe_option,
+ option_context,
+ options,
+)
# let init-time option registration happen
import pandas.core.config_init
from pandas.core.api import (
# dtype
- Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype,
- UInt16Dtype, UInt32Dtype, UInt64Dtype, CategoricalDtype,
- PeriodDtype, IntervalDtype, DatetimeTZDtype,
-
+ Int8Dtype,
+ Int16Dtype,
+ Int32Dtype,
+ Int64Dtype,
+ UInt8Dtype,
+ UInt16Dtype,
+ UInt32Dtype,
+ UInt64Dtype,
+ CategoricalDtype,
+ PeriodDtype,
+ IntervalDtype,
+ DatetimeTZDtype,
# missing
- isna, isnull, notna, notnull,
-
+ isna,
+ isnull,
+ notna,
+ notnull,
# indexes
- Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex,
- Float64Index, MultiIndex, IntervalIndex, TimedeltaIndex,
- DatetimeIndex, PeriodIndex, IndexSlice,
-
+ Index,
+ CategoricalIndex,
+ Int64Index,
+ UInt64Index,
+ RangeIndex,
+ Float64Index,
+ MultiIndex,
+ IntervalIndex,
+ TimedeltaIndex,
+ DatetimeIndex,
+ PeriodIndex,
+ IndexSlice,
# tseries
- NaT, Period, period_range, Timedelta, timedelta_range,
- Timestamp, date_range, bdate_range, Interval, interval_range,
+ NaT,
+ Period,
+ period_range,
+ Timedelta,
+ timedelta_range,
+ Timestamp,
+ date_range,
+ bdate_range,
+ Interval,
+ interval_range,
DateOffset,
-
# conversion
- to_numeric, to_datetime, to_timedelta,
-
+ to_numeric,
+ to_datetime,
+ to_timedelta,
# misc
- np, Grouper, factorize, unique, value_counts, NamedAgg,
- array, Categorical, set_eng_float_format, Series, DataFrame)
+ np,
+ Grouper,
+ factorize,
+ unique,
+ value_counts,
+ NamedAgg,
+ array,
+ Categorical,
+ set_eng_float_format,
+ Series,
+ DataFrame,
+)
from pandas.core.sparse.api import (
- SparseArray, SparseDataFrame, SparseSeries, SparseDtype)
+ SparseArray,
+ SparseDataFrame,
+ SparseSeries,
+ SparseDtype,
+)
from pandas.tseries.api import infer_freq
from pandas.tseries import offsets
@@ -76,35 +127,56 @@
from pandas.core.computation.api import eval
from pandas.core.reshape.api import (
- concat, lreshape, melt, wide_to_long, merge, merge_asof,
- merge_ordered, crosstab, pivot, pivot_table, get_dummies,
- cut, qcut)
+ concat,
+ lreshape,
+ melt,
+ wide_to_long,
+ merge,
+ merge_asof,
+ merge_ordered,
+ crosstab,
+ pivot,
+ pivot_table,
+ get_dummies,
+ cut,
+ qcut,
+)
from pandas.util._print_versions import show_versions
from pandas.io.api import (
# excel
- ExcelFile, ExcelWriter, read_excel,
-
+ ExcelFile,
+ ExcelWriter,
+ read_excel,
# packers
- read_msgpack, to_msgpack,
-
+ read_msgpack,
+ to_msgpack,
# parsers
- read_csv, read_fwf, read_table,
-
+ read_csv,
+ read_fwf,
+ read_table,
# pickle
- read_pickle, to_pickle,
-
+ read_pickle,
+ to_pickle,
# pytables
- HDFStore, read_hdf,
-
+ HDFStore,
+ read_hdf,
# sql
- read_sql, read_sql_query,
+ read_sql,
+ read_sql_query,
read_sql_table,
-
# misc
- read_clipboard, read_parquet, read_feather, read_gbq,
- read_html, read_json, read_stata, read_sas, read_spss)
+ read_clipboard,
+ read_parquet,
+ read_feather,
+ read_gbq,
+ read_html,
+ read_json,
+ read_stata,
+ read_sas,
+ read_spss,
+)
from pandas.util._tester import test
import pandas.testing
@@ -112,31 +184,38 @@
# use the closest tagged version if possible
from ._version import get_versions
+
v = get_versions()
-__version__ = v.get('closest-tag', v['version'])
-__git_version__ = v.get('full-revisionid')
+__version__ = v.get("closest-tag", v["version"])
+__git_version__ = v.get("full-revisionid")
del get_versions, v
# GH 27101
# TODO: remove Panel compat in 1.0
if pandas.compat.PY37:
+
def __getattr__(name):
- if name == 'Panel':
+ if name == "Panel":
import warnings
+
warnings.warn(
"The Panel class is removed from pandas. Accessing it "
"from the top-level namespace will also be removed in "
"the next version",
- FutureWarning, stacklevel=2)
+ FutureWarning,
+ stacklevel=2,
+ )
class Panel:
pass
return Panel
- raise AttributeError(
- "module 'pandas' has no attribute '{}'".format(name))
+ raise AttributeError("module 'pandas' has no attribute '{}'".format(name))
+
+
else:
+
class Panel:
pass
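
The reformatted __getattr__ block above relies on PEP 562 (module-level __getattr__), available from Python 3.7, which is why it is guarded by pandas.compat.PY37. A generic sketch of the pattern with illustrative names, not pandas code:

    import warnings

    def __getattr__(name):
        # Only called when normal module attribute lookup fails (PEP 562).
        if name == "OldThing":
            warnings.warn(
                "OldThing is deprecated and will be removed",
                FutureWarning,
                stacklevel=2,
            )

            class OldThing:  # stand-in returned for backwards compatibility
                pass

            return OldThing
        raise AttributeError("module has no attribute '{}'".format(name))
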
diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
index bf221ea444288..65936a9fcdbf3 100644
--- a/pandas/_config/__init__.py
+++ b/pandas/_config/__init__.py
@@ -5,11 +5,24 @@
importing `dates` and `display` ensures that keys needed by _libs
are initialized.
"""
-__all__ = ["config", "detect_console_encoding", "get_option", "set_option",
- "reset_option", "describe_option", "option_context", "options"]
+__all__ = [
+ "config",
+ "detect_console_encoding",
+ "get_option",
+ "set_option",
+ "reset_option",
+ "describe_option",
+ "option_context",
+ "options",
+]
from pandas._config import config
from pandas._config import dates # noqa:F401
from pandas._config.config import (
- describe_option, get_option, option_context, options, reset_option,
- set_option)
+ describe_option,
+ get_option,
+ option_context,
+ options,
+ reset_option,
+ set_option,
+)
from pandas._config.display import detect_console_encoding
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index 6b685a0ce962a..61e926035c3f2 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -54,9 +54,8 @@
from typing import Dict, List
import warnings
-DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver')
-RegisteredOption = namedtuple('RegisteredOption',
- 'key defval doc validator cb')
+DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver")
+RegisteredOption = namedtuple("RegisteredOption", "key defval doc validator cb")
# holds deprecated option metadata
_deprecated_options = {} # type: Dict[str, DeprecatedOption]
@@ -68,7 +67,7 @@
_global_config = {} # type: Dict[str, str]
# keys which have a special meaning
-_reserved_keys = ['all'] # type: List[str]
+_reserved_keys = ["all"] # type: List[str]
class OptionError(AttributeError, KeyError):
@@ -76,6 +75,7 @@ class OptionError(AttributeError, KeyError):
checks
"""
+
#
# User API
@@ -85,9 +85,9 @@ def _get_single_key(pat, silent):
if len(keys) == 0:
if not silent:
_warn_if_deprecated(pat)
- raise OptionError('No such keys(s): {pat!r}'.format(pat=pat))
+ raise OptionError("No such keys(s): {pat!r}".format(pat=pat))
if len(keys) > 1:
- raise OptionError('Pattern matched multiple keys')
+ raise OptionError("Pattern matched multiple keys")
key = keys[0]
if not silent:
@@ -110,11 +110,10 @@ def _set_option(*args, **kwargs):
# must have at least 1 arg; deal with constraints later
nargs = len(args)
if not nargs or nargs % 2 != 0:
- raise ValueError("Must provide an even number of non-keyword "
- "arguments")
+ raise ValueError("Must provide an even number of non-keyword " "arguments")
# default to false
- silent = kwargs.pop('silent', False)
+ silent = kwargs.pop("silent", False)
if kwargs:
msg = '_set_option() got an unexpected keyword argument "{kwarg}"'
@@ -139,13 +138,13 @@ def _set_option(*args, **kwargs):
o.cb(key)
-def _describe_option(pat='', _print_desc=True):
+def _describe_option(pat="", _print_desc=True):
keys = _select_options(pat)
if len(keys) == 0:
- raise OptionError('No such keys(s)')
+ raise OptionError("No such keys(s)")
- s = ''
+ s = ""
for k in keys: # filter by pat
s += _build_option_description(k)
@@ -160,13 +159,15 @@ def _reset_option(pat, silent=False):
keys = _select_options(pat)
if len(keys) == 0:
- raise OptionError('No such keys(s)')
+ raise OptionError("No such keys(s)")
- if len(keys) > 1 and len(pat) < 4 and pat != 'all':
- raise ValueError('You must specify at least 4 characters when '
- 'resetting multiple keys, use the special keyword '
- '"all" to reset all the options to their default '
- 'value')
+ if len(keys) > 1 and len(pat) < 4 and pat != "all":
+ raise ValueError(
+ "You must specify at least 4 characters when "
+ "resetting multiple keys, use the special keyword "
+ '"all" to reset all the options to their default '
+ "value"
+ )
for k in keys:
_set_option(k, _registered_options[k].defval, silent=silent)
@@ -213,6 +214,7 @@ def __getattr__(self, key):
def __dir__(self):
return list(self.d.keys())
+
# For user convenience, we'd like to have the available options described
# in the docstring. For dev convenience we'd like to generate the docstrings
# dynamically instead of maintaining them by hand. To this end, we use the
@@ -223,7 +225,6 @@ def __dir__(self):
class CallableDynamicDoc:
-
def __init__(self, func, doc_tmpl):
self.__doc_tmpl__ = doc_tmpl
self.__func__ = func
@@ -233,10 +234,9 @@ def __call__(self, *args, **kwds):
@property
def __doc__(self):
- opts_desc = _describe_option('all', _print_desc=False)
+ opts_desc = _describe_option("all", _print_desc=False)
opts_list = pp_options_list(list(_registered_options.keys()))
- return self.__doc_tmpl__.format(opts_desc=opts_desc,
- opts_list=opts_list)
+ return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list)
_get_option_tmpl = """
@@ -394,14 +394,14 @@ class option_context:
def __init__(self, *args):
if not (len(args) % 2 == 0 and len(args) >= 2):
- raise ValueError('Need to invoke as'
- ' option_context(pat, val, [(pat, val), ...]).')
+ raise ValueError(
+ "Need to invoke as" " option_context(pat, val, [(pat, val), ...])."
+ )
self.ops = list(zip(args[::2], args[1::2]))
def __enter__(self):
- self.undo = [(pat, _get_option(pat, silent=True))
- for pat, val in self.ops]
+ self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops]
for pat, val in self.ops:
_set_option(pat, val, silent=True)
@@ -412,7 +412,7 @@ def __exit__(self, *args):
_set_option(pat, val, silent=True)
-def register_option(key, defval, doc='', validator=None, cb=None):
+def register_option(key, defval, doc="", validator=None, cb=None):
"""Register an option in the package-wide pandas config object
Parameters
@@ -437,6 +437,7 @@ def register_option(key, defval, doc='', validator=None, cb=None):
"""
import tokenize
import keyword
+
key = key.lower()
if key in _registered_options:
@@ -451,10 +452,10 @@ def register_option(key, defval, doc='', validator=None, cb=None):
validator(defval)
# walk the nested dict, creating dicts as needed along the path
- path = key.split('.')
+ path = key.split(".")
for k in path:
- if not bool(re.match('^' + tokenize.Name + '$', k)):
+ if not bool(re.match("^" + tokenize.Name + "$", k)):
raise ValueError("{k} is not a valid identifier".format(k=k))
if keyword.iskeyword(k):
raise ValueError("{k} is a python keyword".format(k=k))
@@ -463,20 +464,20 @@ def register_option(key, defval, doc='', validator=None, cb=None):
msg = "Path prefix to option '{option}' is already an option"
for i, p in enumerate(path[:-1]):
if not isinstance(cursor, dict):
- raise OptionError(msg.format(option='.'.join(path[:i])))
+ raise OptionError(msg.format(option=".".join(path[:i])))
if p not in cursor:
cursor[p] = {}
cursor = cursor[p]
if not isinstance(cursor, dict):
- raise OptionError(msg.format(option='.'.join(path[:-1])))
+ raise OptionError(msg.format(option=".".join(path[:-1])))
cursor[path[-1]] = defval # initialize
# save the option metadata
- _registered_options[key] = RegisteredOption(key=key, defval=defval,
- doc=doc, validator=validator,
- cb=cb)
+ _registered_options[key] = RegisteredOption(
+ key=key, defval=defval, doc=doc, validator=validator, cb=cb
+ )
def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
@@ -526,6 +527,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None):
_deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
+
#
# functions internal to the module
@@ -542,14 +544,14 @@ def _select_options(pat):
# else look through all of them
keys = sorted(_registered_options.keys())
- if pat == 'all': # reserved key
+ if pat == "all": # reserved key
return keys
return [k for k in keys if re.search(pat, k, re.I)]
def _get_root(key):
- path = key.split('.')
+ path = key.split(".")
cursor = _global_config
for p in path[:-1]:
cursor = cursor[p]
@@ -621,12 +623,11 @@ def _warn_if_deprecated(key):
else:
msg = "'{key}' is deprecated".format(key=key)
if d.removal_ver:
- msg += (' and will be removed in {version}'
- .format(version=d.removal_ver))
+ msg += " and will be removed in {version}".format(version=d.removal_ver)
if d.rkey:
msg += ", please use '{rkey}' instead.".format(rkey=d.rkey)
else:
- msg += ', please refrain from using it.'
+ msg += ", please refrain from using it."
warnings.warn(msg, FutureWarning)
return True
@@ -639,22 +640,22 @@ def _build_option_description(k):
o = _get_registered_option(k)
d = _get_deprecated_option(k)
- s = '{k} '.format(k=k)
+ s = "{k} ".format(k=k)
if o.doc:
- s += '\n'.join(o.doc.strip().split('\n'))
+ s += "\n".join(o.doc.strip().split("\n"))
else:
- s += 'No description available.'
+ s += "No description available."
if o:
- s += ('\n [default: {default}] [currently: {current}]'
- .format(default=o.defval, current=_get_option(k, True)))
+ s += "\n [default: {default}] [currently: {current}]".format(
+ default=o.defval, current=_get_option(k, True)
+ )
if d:
- s += '\n (Deprecated'
- s += (', use `{rkey}` instead.'
- .format(rkey=d.rkey if d.rkey else ''))
- s += ')'
+ s += "\n (Deprecated"
+ s += ", use `{rkey}` instead.".format(rkey=d.rkey if d.rkey else "")
+ s += ")"
return s
@@ -666,28 +667,34 @@ def pp_options_list(keys, width=80, _print=False):
from itertools import groupby
def pp(name, ks):
- pfx = ('- ' + name + '.[' if name else '')
- ls = wrap(', '.join(ks), width, initial_indent=pfx,
- subsequent_indent=' ', break_long_words=False)
+ pfx = "- " + name + ".[" if name else ""
+ ls = wrap(
+ ", ".join(ks),
+ width,
+ initial_indent=pfx,
+ subsequent_indent=" ",
+ break_long_words=False,
+ )
if ls and ls[-1] and name:
- ls[-1] = ls[-1] + ']'
+ ls[-1] = ls[-1] + "]"
return ls
ls = []
- singles = [x for x in sorted(keys) if x.find('.') < 0]
+ singles = [x for x in sorted(keys) if x.find(".") < 0]
if singles:
- ls += pp('', singles)
- keys = [x for x in keys if x.find('.') >= 0]
+ ls += pp("", singles)
+ keys = [x for x in keys if x.find(".") >= 0]
- for k, g in groupby(sorted(keys), lambda x: x[:x.rfind('.')]):
- ks = [x[len(k) + 1:] for x in list(g)]
+ for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]):
+ ks = [x[len(k) + 1 :] for x in list(g)]
ls += pp(k, ks)
- s = '\n'.join(ls)
+ s = "\n".join(ls)
if _print:
print(s)
else:
return s
+
#
# helpers
@@ -724,7 +731,7 @@ def config_prefix(prefix):
def wrap(func):
def inner(key, *args, **kwds):
- pkey = '{prefix}.{key}'.format(prefix=prefix, key=key)
+ pkey = "{prefix}.{key}".format(prefix=prefix, key=key)
return func(pkey, *args, **kwds)
return inner
@@ -740,6 +747,7 @@ def inner(key, *args, **kwds):
get_option = _get_option
register_option = _register_option
+
# These factories and methods are handy for use as the validator
# arg in register_option
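
The comment above refers to the pattern used in the _config/dates.py and _config/display.py hunks that follow: options are registered under a prefix with a validator. A brief illustrative sketch; the option name and doc string below are made up:

    from pandas._config import config as cf

    with cf.config_prefix("display"):
        # Registers "display.example_flag" and type-checks future assignments.
        cf.register_option(
            "example_flag", False, "illustrative doc string", validator=cf.is_bool
        )
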
diff --git a/pandas/_config/dates.py b/pandas/_config/dates.py
index 85300a308de62..5bf2b49ce5904 100644
--- a/pandas/_config/dates.py
+++ b/pandas/_config/dates.py
@@ -13,9 +13,11 @@
When True, prints and parses dates with the year first, eg 2005/01/20
"""
-with cf.config_prefix('display'):
+with cf.config_prefix("display"):
# Needed upstream of `_libs` because these are used in tslibs.parsing
- cf.register_option('date_dayfirst', False, pc_date_dayfirst_doc,
- validator=cf.is_bool)
- cf.register_option('date_yearfirst', False, pc_date_yearfirst_doc,
- validator=cf.is_bool)
+ cf.register_option(
+ "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool
+ )
+ cf.register_option(
+ "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool
+ )
diff --git a/pandas/_config/display.py b/pandas/_config/display.py
index 7997d12e06aa9..6e5fabe2706e5 100644
--- a/pandas/_config/display.py
+++ b/pandas/_config/display.py
@@ -25,14 +25,14 @@ def detect_console_encoding():
pass
# try again for something better
- if not encoding or 'ascii' in encoding.lower():
+ if not encoding or "ascii" in encoding.lower():
try:
encoding = locale.getpreferredencoding()
except Exception:
pass
# when all else fails, this will usually be "ascii"
- if not encoding or 'ascii' in encoding.lower():
+ if not encoding or "ascii" in encoding.lower():
encoding = sys.getdefaultencoding()
# GH#3360, save the reported defencoding at import time
@@ -50,6 +50,7 @@ def detect_console_encoding():
these are generally strings meant to be displayed on the console.
"""
-with cf.config_prefix('display'):
- cf.register_option('encoding', detect_console_encoding(), pc_encoding_doc,
- validator=cf.is_text)
+with cf.config_prefix("display"):
+ cf.register_option(
+ "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text
+ )
diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py
index 1ca6d073f18c4..46802c6460959 100644
--- a/pandas/_config/localization.py
+++ b/pandas/_config/localization.py
@@ -37,7 +37,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL):
locale.setlocale(lc_var, new_locale)
normalized_locale = locale.getlocale()
if all(x is not None for x in normalized_locale):
- yield '.'.join(normalized_locale)
+ yield ".".join(normalized_locale)
else:
yield new_locale
finally:
@@ -99,15 +99,16 @@ def _valid_locales(locales, normalize):
def _default_locale_getter():
try:
- raw_locales = subprocess.check_output(['locale -a'], shell=True)
+ raw_locales = subprocess.check_output(["locale -a"], shell=True)
except subprocess.CalledProcessError as e:
- raise type(e)("{exception}, the 'locale -a' command cannot be found "
- "on your system".format(exception=e))
+ raise type(e)(
+ "{exception}, the 'locale -a' command cannot be found "
+ "on your system".format(exception=e)
+ )
return raw_locales
-def get_locales(prefix=None, normalize=True,
- locale_getter=_default_locale_getter):
+def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter):
"""
Get all the locales that are available on the system.
@@ -145,11 +146,10 @@ def get_locales(prefix=None, normalize=True,
# raw_locales is "\n" separated list of locales
# it may contain non-decodable parts, so split
# extract what we can and then rejoin.
- raw_locales = raw_locales.split(b'\n')
+ raw_locales = raw_locales.split(b"\n")
out_locales = []
for x in raw_locales:
- out_locales.append(str(
- x, encoding=options.display.encoding))
+ out_locales.append(str(x, encoding=options.display.encoding))
except TypeError:
pass
@@ -157,6 +157,6 @@ def get_locales(prefix=None, normalize=True,
if prefix is None:
return _valid_locales(out_locales, normalize)
- pattern = re.compile('{prefix}.*'.format(prefix=prefix))
- found = pattern.findall('\n'.join(out_locales))
+ pattern = re.compile("{prefix}.*".format(prefix=prefix))
+ found = pattern.findall("\n".join(out_locales))
return _valid_locales(found, normalize)
diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
index fcf5ffbfcad92..af67cb3be7102 100644
--- a/pandas/_libs/__init__.py
+++ b/pandas/_libs/__init__.py
@@ -1,4 +1,11 @@
# flake8: noqa
from .tslibs import (
- NaT, NaTType, OutOfBoundsDatetime, Period, Timedelta, Timestamp, iNaT)
+ NaT,
+ NaTType,
+ OutOfBoundsDatetime,
+ Period,
+ Timedelta,
+ Timestamp,
+ iNaT,
+)
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 8947e98bf52ce..46b1b4685ec9f 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -9,19 +9,25 @@
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCExtensionArray, ABCIndexClass, ABCSeries, ABCSparseSeries)
+ ABCDataFrame,
+ ABCExtensionArray,
+ ABCIndexClass,
+ ABCSeries,
+ ABCSparseSeries,
+)
-AnyArrayLike = TypeVar('AnyArrayLike',
- ABCExtensionArray,
- ABCIndexClass,
- ABCSeries,
- ABCSparseSeries,
- np.ndarray)
-ArrayLike = TypeVar('ArrayLike', ABCExtensionArray, np.ndarray)
-DatetimeLikeScalar = TypeVar('DatetimeLikeScalar', Period, Timestamp,
- Timedelta)
+AnyArrayLike = TypeVar(
+ "AnyArrayLike",
+ ABCExtensionArray,
+ ABCIndexClass,
+ ABCSeries,
+ ABCSparseSeries,
+ np.ndarray,
+)
+ArrayLike = TypeVar("ArrayLike", ABCExtensionArray, np.ndarray)
+DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", Period, Timestamp, Timedelta)
Dtype = Union[str, np.dtype, ExtensionDtype]
FilePathOrBuffer = Union[str, Path, IO[AnyStr]]
-FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame)
+FrameOrSeries = TypeVar("FrameOrSeries", ABCSeries, ABCDataFrame)
Scalar = Union[str, int, float]
diff --git a/pandas/_version.py b/pandas/_version.py
index 5031f411270a1..4f5bdf59a99d5 100644
--- a/pandas/_version.py
+++ b/pandas/_version.py
@@ -56,6 +56,7 @@ def decorate(f: Callable) -> Callable:
HANDLERS[vcs] = {}
HANDLERS[vcs][method] = f
return f
+
return decorate
@@ -66,9 +67,12 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
try:
dispcmd = str([c] + args)
# remember shell=False, so use git.cmd on windows, not just git
- p = subprocess.Popen([c] + args, cwd=cwd, stdout=subprocess.PIPE,
- stderr=(subprocess.PIPE if hide_stderr
- else None))
+ p = subprocess.Popen(
+ [c] + args,
+ cwd=cwd,
+ stdout=subprocess.PIPE,
+ stderr=(subprocess.PIPE if hide_stderr else None),
+ )
break
except EnvironmentError:
e = sys.exc_info()[1]
@@ -96,14 +100,19 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
dirname = os.path.basename(root)
if not dirname.startswith(parentdir_prefix):
if verbose:
- print("guessing rootdir is '{root}', but '{dirname}' "
- "doesn't start with prefix '{parentdir_prefix}'".format(
- root=root, dirname=dirname,
- parentdir_prefix=parentdir_prefix))
+ print(
+ "guessing rootdir is '{root}', but '{dirname}' "
+ "doesn't start with prefix '{parentdir_prefix}'".format(
+ root=root, dirname=dirname, parentdir_prefix=parentdir_prefix
+ )
+ )
raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
- return {"version": dirname[len(parentdir_prefix):],
- "full-revisionid": None,
- "dirty": False, "error": None}
+ return {
+ "version": dirname[len(parentdir_prefix) :],
+ "full-revisionid": None,
+ "dirty": False,
+ "error": None,
+ }
@register_vcs_handler("git", "get_keywords")
@@ -143,7 +152,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
- tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
+ tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)}
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
@@ -152,7 +161,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
- tags = {r for r in refs if re.search(r'\d', r)}
+ tags = {r for r in refs if re.search(r"\d", r)}
if verbose:
print("discarding '{}', no digits".format(",".join(refs - tags)))
if verbose:
@@ -160,19 +169,24 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
- r = ref[len(tag_prefix):]
+ r = ref[len(tag_prefix) :]
if verbose:
print("picking {r}".format(r=r))
- return {"version": r,
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": None
- }
+ return {
+ "version": r,
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False,
+ "error": None,
+ }
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
- return {"version": "0+unknown",
- "full-revisionid": keywords["full"].strip(),
- "dirty": False, "error": "no suitable tags"}
+ return {
+ "version": "0+unknown",
+ "full-revisionid": keywords["full"].strip(),
+ "dirty": False,
+ "error": "no suitable tags",
+ }
@register_vcs_handler("git", "pieces_from_vcs")
@@ -192,9 +206,9 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
GITS = ["git.cmd", "git.exe"]
# if there is a tag, this yields TAG-NUM-gHEX[-dirty]
# if there are no tags, this yields HEX[-dirty] (no NUM)
- describe_out = run_command(GITS, ["describe", "--tags", "--dirty",
- "--always", "--long"],
- cwd=root)
+ describe_out = run_command(
+ GITS, ["describe", "--tags", "--dirty", "--always", "--long"], cwd=root
+ )
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
@@ -217,32 +231,32 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
- git_describe = git_describe[:git_describe.rindex("-dirty")]
+ git_describe = git_describe[: git_describe.rindex("-dirty")]
# now we have TAG-NUM-gHEX or HEX
if "-" in git_describe:
# TAG-NUM-gHEX
- mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+ mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
if not mo:
# unparseable. Maybe git-describe is misbehaving?
- pieces["error"] = ("unable to parse git-describe output: "
- "'{describe_out}'".format(
- describe_out=describe_out))
+ pieces["error"] = (
+ "unable to parse git-describe output: "
+ "'{describe_out}'".format(describe_out=describe_out)
+ )
return pieces
# tag
full_tag = mo.group(1)
if not full_tag.startswith(tag_prefix):
- fmt = ("tag '{full_tag}' doesn't start with prefix "
- "'{tag_prefix}'")
+ fmt = "tag '{full_tag}' doesn't start with prefix " "'{tag_prefix}'"
msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix)
if verbose:
print(msg)
pieces["error"] = msg
return pieces
- pieces["closest-tag"] = full_tag[len(tag_prefix):]
+ pieces["closest-tag"] = full_tag[len(tag_prefix) :]
# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
@@ -253,8 +267,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
else:
# HEX: no tags
pieces["closest-tag"] = None
- count_out = run_command(GITS, ["rev-list", "HEAD", "--count"],
- cwd=root)
+ count_out = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
pieces["distance"] = int(count_out) # total number of commits
return pieces
@@ -283,8 +296,7 @@ def render_pep440(pieces):
rendered += ".dirty"
else:
# exception #1
- rendered = "0+untagged.{:d}.g{}".format(pieces["distance"],
- pieces["short"])
+ rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
@@ -391,10 +403,12 @@ def render_git_describe_long(pieces):
def render(pieces, style):
if pieces["error"]:
- return {"version": "unknown",
- "full-revisionid": pieces.get("long"),
- "dirty": None,
- "error": pieces["error"]}
+ return {
+ "version": "unknown",
+ "full-revisionid": pieces.get("long"),
+ "dirty": None,
+ "error": pieces["error"],
+ }
if not style or style == "default":
style = "pep440" # the default
@@ -414,8 +428,12 @@ def render(pieces, style):
else:
raise ValueError("unknown style '{style}'".format(style=style))
- return {"version": rendered, "full-revisionid": pieces["long"],
- "dirty": pieces["dirty"], "error": None}
+ return {
+ "version": rendered,
+ "full-revisionid": pieces["long"],
+ "dirty": pieces["dirty"],
+ "error": None,
+ }
def get_versions():
@@ -428,8 +446,7 @@ def get_versions():
verbose = cfg.verbose
try:
- return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
- verbose)
+ return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
except NotThisMethod:
pass
@@ -438,12 +455,15 @@ def get_versions():
# versionfile_source is the relative path from the top of the source
# tree (where the .git directory might live) to this file. Invert
# this to find the root from __file__.
- for i in cfg.versionfile_source.split('/'):
+ for i in cfg.versionfile_source.split("/"):
root = os.path.dirname(root)
except NameError:
- return {"version": "0+unknown", "full-revisionid": None,
- "dirty": None,
- "error": "unable to find root of source tree"}
+ return {
+ "version": "0+unknown",
+ "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to find root of source tree",
+ }
try:
pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
@@ -457,6 +477,9 @@ def get_versions():
except NotThisMethod:
pass
- return {"version": "0+unknown", "full-revisionid": None,
- "dirty": None,
- "error": "unable to compute version"}
+ return {
+ "version": "0+unknown",
+ "full-revisionid": None,
+ "dirty": None,
+ "error": "unable to compute version",
+ }
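
For context on how the dictionaries reformatted above are consumed: get_versions() returns a mapping that pandas/__init__.py reduces to the version strings shown earlier in this patch. A small sketch with made-up values:

    # Shape of the dict produced by render()/get_versions(); values are illustrative.
    v = {
        "version": "0.25.0",
        "full-revisionid": "0123abcd",
        "dirty": False,
        "error": None,
    }
    __version__ = v.get("closest-tag", v["version"])  # fall back to the raw version
    __git_version__ = v.get("full-revisionid")
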
diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
index 0bd2733cb494c..431dd2b1968ae 100644
--- a/pandas/api/extensions/__init__.py
+++ b/pandas/api/extensions/__init__.py
@@ -1,12 +1,14 @@
"""Public API for extending pandas objects."""
from pandas.core.dtypes.dtypes import ( # noqa: F401
- ExtensionDtype, register_extension_dtype)
+ ExtensionDtype,
+ register_extension_dtype,
+)
from pandas.core.accessor import ( # noqa: F401
- register_index_accessor, register_series_accessor)
+ register_index_accessor,
+ register_series_accessor,
+)
from pandas.core.algorithms import take # noqa: F401
-from pandas.core.arrays import ( # noqa: F401
- ExtensionArray, ExtensionScalarOpsMixin)
+from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401
-from pandas.core.accessor import ( # noqa: F401; noqa: F401
- register_dataframe_accessor)
+from pandas.core.accessor import register_dataframe_accessor # noqa: F401; noqa: F401
diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py
index 668f79921d8e6..f32e1abe28cc1 100644
--- a/pandas/api/types/__init__.py
+++ b/pandas/api/types/__init__.py
@@ -5,4 +5,8 @@
from pandas.core.dtypes.api import * # noqa: F403, F401
from pandas.core.dtypes.concat import union_categoricals # noqa: F401
from pandas.core.dtypes.dtypes import ( # noqa: F401
- CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype)
+ CategoricalDtype,
+ DatetimeTZDtype,
+ IntervalDtype,
+ PeriodDtype,
+)
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index ab014d49236b3..db01f2a0c674f 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -4,16 +4,23 @@
See :ref:`extending.extension-types` for more.
"""
from pandas.core.arrays import (
- Categorical, DatetimeArray, IntegerArray, IntervalArray, PandasArray,
- PeriodArray, SparseArray, TimedeltaArray)
+ Categorical,
+ DatetimeArray,
+ IntegerArray,
+ IntervalArray,
+ PandasArray,
+ PeriodArray,
+ SparseArray,
+ TimedeltaArray,
+)
__all__ = [
- 'Categorical',
- 'DatetimeArray',
- 'IntegerArray',
- 'IntervalArray',
- 'PandasArray',
- 'PeriodArray',
- 'SparseArray',
- 'TimedeltaArray',
+ "Categorical",
+ "DatetimeArray",
+ "IntegerArray",
+ "IntervalArray",
+ "PandasArray",
+ "PeriodArray",
+ "SparseArray",
+ "TimedeltaArray",
]
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 4459e66540dac..c9597505fa596 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -13,7 +13,7 @@
PY36 = sys.version_info >= (3, 6)
PY37 = sys.version_info >= (3, 7)
-PYPY = platform.python_implementation() == 'PyPy'
+PYPY = platform.python_implementation() == "PyPy"
# ----------------------------------------------------------------------------
@@ -29,9 +29,7 @@ def set_function_name(f, name, cls):
Bind the name/qualname attributes of the function
"""
f.__name__ = name
- f.__qualname__ = '{klass}.{name}'.format(
- klass=cls.__name__,
- name=name)
+ f.__qualname__ = "{klass}.{name}".format(klass=cls.__name__, name=name)
f.__module__ = cls.__module__
return f
@@ -49,19 +47,19 @@ def raise_with_traceback(exc, traceback=Ellipsis):
# https://github.com/pandas-dev/pandas/pull/9123
def is_platform_little_endian():
""" am I little endian """
- return sys.byteorder == 'little'
+ return sys.byteorder == "little"
def is_platform_windows():
- return sys.platform == 'win32' or sys.platform == 'cygwin'
+ return sys.platform == "win32" or sys.platform == "cygwin"
def is_platform_linux():
- return sys.platform == 'linux2'
+ return sys.platform == "linux2"
def is_platform_mac():
- return sys.platform == 'darwin'
+ return sys.platform == "darwin"
def is_platform_32bit():
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 620884d66821c..cd4e1b7e8aa4d 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -39,23 +39,18 @@
def _get_version(module: types.ModuleType) -> str:
- version = getattr(module, '__version__', None)
+ version = getattr(module, "__version__", None)
if version is None:
# xlrd uses a capitalized attribute name
- version = getattr(module, '__VERSION__', None)
+ version = getattr(module, "__VERSION__", None)
if version is None:
- raise ImportError(
- "Can't determine version for {}".format(module.__name__)
- )
+ raise ImportError("Can't determine version for {}".format(module.__name__))
return version
def import_optional_dependency(
- name: str,
- extra: str = "",
- raise_on_missing: bool = True,
- on_version: str = "raise",
+ name: str, extra: str = "", raise_on_missing: bool = True, on_version: str = "raise"
):
"""
Import an optional dependency.
@@ -105,9 +100,7 @@ def import_optional_dependency(
if distutils.version.LooseVersion(version) < minimum_version:
assert on_version in {"warn", "raise", "ignore"}
msg = version_message.format(
- minimum_version=minimum_version,
- name=name,
- actual_version=version,
+ minimum_version=minimum_version, name=name, actual_version=version
)
if on_version == "warn":
warnings.warn(msg, UserWarning)
diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py
index e57a2ba3af0ac..83f1da597d6a6 100644
--- a/pandas/compat/chainmap.py
+++ b/pandas/compat/chainmap.py
@@ -2,7 +2,6 @@
class DeepChainMap(ChainMap):
-
def __setitem__(self, key, value):
for mapping in self.maps:
if key in mapping:
diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py
index 22bfab8b7c6d6..ce56c08d3ec14 100644
--- a/pandas/compat/numpy/__init__.py
+++ b/pandas/compat/numpy/__init__.py
@@ -8,27 +8,29 @@
# numpy versioning
_np_version = np.__version__
_nlv = LooseVersion(_np_version)
-_np_version_under1p14 = _nlv < LooseVersion('1.14')
-_np_version_under1p15 = _nlv < LooseVersion('1.15')
-_np_version_under1p16 = _nlv < LooseVersion('1.16')
-_np_version_under1p17 = _nlv < LooseVersion('1.17')
-_is_numpy_dev = '.dev' in str(_nlv)
+_np_version_under1p14 = _nlv < LooseVersion("1.14")
+_np_version_under1p15 = _nlv < LooseVersion("1.15")
+_np_version_under1p16 = _nlv < LooseVersion("1.16")
+_np_version_under1p17 = _nlv < LooseVersion("1.17")
+_is_numpy_dev = ".dev" in str(_nlv)
-if _nlv < '1.13.3':
- raise ImportError('this version of pandas is incompatible with '
- 'numpy < 1.13.3\n'
- 'your numpy version is {0}.\n'
- 'Please upgrade numpy to >= 1.13.3 to use '
- 'this pandas version'.format(_np_version))
+if _nlv < "1.13.3":
+ raise ImportError(
+ "this version of pandas is incompatible with "
+ "numpy < 1.13.3\n"
+ "your numpy version is {0}.\n"
+ "Please upgrade numpy to >= 1.13.3 to use "
+ "this pandas version".format(_np_version)
+ )
-_tz_regex = re.compile('[+-]0000$')
+_tz_regex = re.compile("[+-]0000$")
def tz_replacer(s):
if isinstance(s, str):
- if s.endswith('Z'):
+ if s.endswith("Z"):
s = s[:-1]
elif _tz_regex.search(s):
s = s[:-5]
@@ -53,7 +55,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs):
warning, when we need to pass '2015-01-01 09:00:00'
"""
# is_list_like
- if (hasattr(arr, '__iter__') and not isinstance(arr, (str, bytes))):
+ if hasattr(arr, "__iter__") and not isinstance(arr, (str, bytes)):
arr = [tz_replacer(s) for s in arr]
else:
arr = tz_replacer(arr)
@@ -61,11 +63,12 @@ def np_array_datetime64_compat(arr, *args, **kwargs):
return np.array(arr, *args, **kwargs)
-__all__ = ['np',
- '_np_version',
- '_np_version_under1p14',
- '_np_version_under1p15',
- '_np_version_under1p16',
- '_np_version_under1p17',
- '_is_numpy_dev'
- ]
+__all__ = [
+ "np",
+ "_np_version",
+ "_np_version_under1p14",
+ "_np_version_under1p15",
+ "_np_version_under1p16",
+ "_np_version_under1p17",
+ "_is_numpy_dev",
+]
diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py
index 572dd7272986b..840dec2489a52 100644
--- a/pandas/compat/numpy/function.py
+++ b/pandas/compat/numpy/function.py
@@ -26,45 +26,50 @@
from pandas._libs.lib import is_bool, is_integer
from pandas.errors import UnsupportedFunctionCall
from pandas.util._validators import (
- validate_args, validate_args_and_kwargs, validate_kwargs)
+ validate_args,
+ validate_args_and_kwargs,
+ validate_kwargs,
+)
class CompatValidator:
-
- def __init__(self, defaults, fname=None, method=None,
- max_fname_arg_count=None):
+ def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None):
self.fname = fname
self.method = method
self.defaults = defaults
self.max_fname_arg_count = max_fname_arg_count
- def __call__(self, args, kwargs, fname=None,
- max_fname_arg_count=None, method=None):
+ def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None):
if args or kwargs:
fname = self.fname if fname is None else fname
- max_fname_arg_count = (self.max_fname_arg_count if
- max_fname_arg_count is None
- else max_fname_arg_count)
+ max_fname_arg_count = (
+ self.max_fname_arg_count
+ if max_fname_arg_count is None
+ else max_fname_arg_count
+ )
method = self.method if method is None else method
- if method == 'args':
+ if method == "args":
validate_args(fname, args, max_fname_arg_count, self.defaults)
- elif method == 'kwargs':
+ elif method == "kwargs":
validate_kwargs(fname, kwargs, self.defaults)
- elif method == 'both':
- validate_args_and_kwargs(fname, args, kwargs,
- max_fname_arg_count,
- self.defaults)
+ elif method == "both":
+ validate_args_and_kwargs(
+ fname, args, kwargs, max_fname_arg_count, self.defaults
+ )
else:
- raise ValueError("invalid validation method "
- "'{method}'".format(method=method))
+ raise ValueError(
+ "invalid validation method " "'{method}'".format(method=method)
+ )
ARGMINMAX_DEFAULTS = dict(out=None)
-validate_argmin = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmin',
- method='both', max_fname_arg_count=1)
-validate_argmax = CompatValidator(ARGMINMAX_DEFAULTS, fname='argmax',
- method='both', max_fname_arg_count=1)
+validate_argmin = CompatValidator(
+ ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1
+)
+validate_argmax = CompatValidator(
+ ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1
+)
def process_skipna(skipna, args):
@@ -103,28 +108,30 @@ def validate_argmax_with_skipna(skipna, args, kwargs):
return skipna
-ARGSORT_DEFAULTS = OrderedDict() \
- # type: OrderedDict[str, Optional[Union[int, str]]]
-ARGSORT_DEFAULTS['axis'] = -1
-ARGSORT_DEFAULTS['kind'] = 'quicksort'
-ARGSORT_DEFAULTS['order'] = None
+ARGSORT_DEFAULTS = OrderedDict()
+# type: OrderedDict[str, Optional[Union[int, str]]]
+ARGSORT_DEFAULTS["axis"] = -1
+ARGSORT_DEFAULTS["kind"] = "quicksort"
+ARGSORT_DEFAULTS["order"] = None
if LooseVersion(_np_version) >= LooseVersion("1.17.0"):
# GH-26361. NumPy added radix sort and changed default to None.
- ARGSORT_DEFAULTS['kind'] = None
+ ARGSORT_DEFAULTS["kind"] = None
-validate_argsort = CompatValidator(ARGSORT_DEFAULTS, fname='argsort',
- max_fname_arg_count=0, method='both')
+validate_argsort = CompatValidator(
+ ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both"
+)
# two different signatures of argsort, this second validation
# for when the `kind` param is supported
-ARGSORT_DEFAULTS_KIND = OrderedDict() \
- # type: OrderedDict[str, Optional[int]]
-ARGSORT_DEFAULTS_KIND['axis'] = -1
-ARGSORT_DEFAULTS_KIND['order'] = None
-validate_argsort_kind = CompatValidator(ARGSORT_DEFAULTS_KIND, fname='argsort',
- max_fname_arg_count=0, method='both')
+ARGSORT_DEFAULTS_KIND = OrderedDict()
+# type: OrderedDict[str, Optional[int]]
+ARGSORT_DEFAULTS_KIND["axis"] = -1
+ARGSORT_DEFAULTS_KIND["order"] = None
+validate_argsort_kind = CompatValidator(
+ ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both"
+)
def validate_argsort_with_ascending(ascending, args, kwargs):
@@ -145,8 +152,9 @@ def validate_argsort_with_ascending(ascending, args, kwargs):
CLIP_DEFAULTS = dict(out=None) # type Dict[str, Any]
-validate_clip = CompatValidator(CLIP_DEFAULTS, fname='clip',
- method='both', max_fname_arg_count=3)
+validate_clip = CompatValidator(
+ CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3
+)
def validate_clip_with_axis(axis, args, kwargs):
@@ -166,18 +174,21 @@ def validate_clip_with_axis(axis, args, kwargs):
COMPRESS_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any]
-COMPRESS_DEFAULTS['axis'] = None
-COMPRESS_DEFAULTS['out'] = None
-validate_compress = CompatValidator(COMPRESS_DEFAULTS, fname='compress',
- method='both', max_fname_arg_count=1)
+COMPRESS_DEFAULTS["axis"] = None
+COMPRESS_DEFAULTS["out"] = None
+validate_compress = CompatValidator(
+ COMPRESS_DEFAULTS, fname="compress", method="both", max_fname_arg_count=1
+)
CUM_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Any]
-CUM_FUNC_DEFAULTS['dtype'] = None
-CUM_FUNC_DEFAULTS['out'] = None
-validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='both',
- max_fname_arg_count=1)
-validate_cumsum = CompatValidator(CUM_FUNC_DEFAULTS, fname='cumsum',
- method='both', max_fname_arg_count=1)
+CUM_FUNC_DEFAULTS["dtype"] = None
+CUM_FUNC_DEFAULTS["out"] = None
+validate_cum_func = CompatValidator(
+ CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1
+)
+validate_cumsum = CompatValidator(
+ CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1
+)
def validate_cum_func_with_skipna(skipna, args, kwargs, name):
@@ -196,81 +207,88 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name):
ALLANY_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[bool]]
-ALLANY_DEFAULTS['dtype'] = None
-ALLANY_DEFAULTS['out'] = None
-ALLANY_DEFAULTS['keepdims'] = False
-validate_all = CompatValidator(ALLANY_DEFAULTS, fname='all',
- method='both', max_fname_arg_count=1)
-validate_any = CompatValidator(ALLANY_DEFAULTS, fname='any',
- method='both', max_fname_arg_count=1)
+ALLANY_DEFAULTS["dtype"] = None
+ALLANY_DEFAULTS["out"] = None
+ALLANY_DEFAULTS["keepdims"] = False
+validate_all = CompatValidator(
+ ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1
+)
+validate_any = CompatValidator(
+ ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1
+)
LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False)
-validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs')
+validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs")
MINMAX_DEFAULTS = dict(out=None, keepdims=False)
-validate_min = CompatValidator(MINMAX_DEFAULTS, fname='min',
- method='both', max_fname_arg_count=1)
-validate_max = CompatValidator(MINMAX_DEFAULTS, fname='max',
- method='both', max_fname_arg_count=1)
-
-RESHAPE_DEFAULTS = dict(order='C') # type: Dict[str, str]
-validate_reshape = CompatValidator(RESHAPE_DEFAULTS, fname='reshape',
- method='both', max_fname_arg_count=1)
+validate_min = CompatValidator(
+ MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1
+)
+validate_max = CompatValidator(
+ MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1
+)
+
+RESHAPE_DEFAULTS = dict(order="C") # type: Dict[str, str]
+validate_reshape = CompatValidator(
+ RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1
+)
REPEAT_DEFAULTS = dict(axis=None) # type: Dict[str, Any]
-validate_repeat = CompatValidator(REPEAT_DEFAULTS, fname='repeat',
- method='both', max_fname_arg_count=1)
+validate_repeat = CompatValidator(
+ REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1
+)
ROUND_DEFAULTS = dict(out=None) # type: Dict[str, Any]
-validate_round = CompatValidator(ROUND_DEFAULTS, fname='round',
- method='both', max_fname_arg_count=1)
+validate_round = CompatValidator(
+ ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1
+)
-SORT_DEFAULTS = OrderedDict() \
- # type: OrderedDict[str, Optional[Union[int, str]]]
-SORT_DEFAULTS['axis'] = -1
-SORT_DEFAULTS['kind'] = 'quicksort'
-SORT_DEFAULTS['order'] = None
-validate_sort = CompatValidator(SORT_DEFAULTS, fname='sort',
- method='kwargs')
+SORT_DEFAULTS = OrderedDict()  # type: OrderedDict[str, Optional[Union[int, str]]]
+SORT_DEFAULTS["axis"] = -1
+SORT_DEFAULTS["kind"] = "quicksort"
+SORT_DEFAULTS["order"] = None
+validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs")
STAT_FUNC_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[Any]]
-STAT_FUNC_DEFAULTS['dtype'] = None
-STAT_FUNC_DEFAULTS['out'] = None
+STAT_FUNC_DEFAULTS["dtype"] = None
+STAT_FUNC_DEFAULTS["out"] = None
PROD_DEFAULTS = SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
-SUM_DEFAULTS['keepdims'] = False
-SUM_DEFAULTS['initial'] = None
+SUM_DEFAULTS["keepdims"] = False
+SUM_DEFAULTS["initial"] = None
MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
-MEDIAN_DEFAULTS['overwrite_input'] = False
-MEDIAN_DEFAULTS['keepdims'] = False
-
-STAT_FUNC_DEFAULTS['keepdims'] = False
-
-validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS,
- method='kwargs')
-validate_sum = CompatValidator(SUM_DEFAULTS, fname='sum',
- method='both', max_fname_arg_count=1)
-validate_prod = CompatValidator(PROD_DEFAULTS, fname="prod",
- method="both", max_fname_arg_count=1)
-validate_mean = CompatValidator(STAT_FUNC_DEFAULTS, fname='mean',
- method='both', max_fname_arg_count=1)
-validate_median = CompatValidator(MEDIAN_DEFAULTS, fname='median',
- method='both', max_fname_arg_count=1)
-
-STAT_DDOF_FUNC_DEFAULTS = OrderedDict() \
- # type: OrderedDict[str, Optional[bool]]
-STAT_DDOF_FUNC_DEFAULTS['dtype'] = None
-STAT_DDOF_FUNC_DEFAULTS['out'] = None
-STAT_DDOF_FUNC_DEFAULTS['keepdims'] = False
-validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS,
- method='kwargs')
+MEDIAN_DEFAULTS["overwrite_input"] = False
+MEDIAN_DEFAULTS["keepdims"] = False
+
+STAT_FUNC_DEFAULTS["keepdims"] = False
+
+validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs")
+validate_sum = CompatValidator(
+ SUM_DEFAULTS, fname="sum", method="both", max_fname_arg_count=1
+)
+validate_prod = CompatValidator(
+ PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1
+)
+validate_mean = CompatValidator(
+ STAT_FUNC_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1
+)
+validate_median = CompatValidator(
+ MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1
+)
+
+STAT_DDOF_FUNC_DEFAULTS = OrderedDict()  # type: OrderedDict[str, Optional[bool]]
+STAT_DDOF_FUNC_DEFAULTS["dtype"] = None
+STAT_DDOF_FUNC_DEFAULTS["out"] = None
+STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False
+validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs")
TAKE_DEFAULTS = OrderedDict() # type: OrderedDict[str, Optional[str]]
-TAKE_DEFAULTS['out'] = None
-TAKE_DEFAULTS['mode'] = 'raise'
-validate_take = CompatValidator(TAKE_DEFAULTS, fname='take',
- method='kwargs')
+TAKE_DEFAULTS["out"] = None
+TAKE_DEFAULTS["mode"] = "raise"
+validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs")
def validate_take_with_convert(convert, args, kwargs):
@@ -285,20 +303,23 @@ def validate_take_with_convert(convert, args, kwargs):
args = (convert,) + args
convert = True
- validate_take(args, kwargs, max_fname_arg_count=3, method='both')
+ validate_take(args, kwargs, max_fname_arg_count=3, method="both")
return convert
TRANSPOSE_DEFAULTS = dict(axes=None)
-validate_transpose = CompatValidator(TRANSPOSE_DEFAULTS, fname='transpose',
- method='both', max_fname_arg_count=0)
+validate_transpose = CompatValidator(
+ TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0
+)
def validate_window_func(name, args, kwargs):
- numpy_args = ('axis', 'dtype', 'out')
- msg = ("numpy operations are not "
- "valid with window objects. "
- "Use .{func}() directly instead ".format(func=name))
+ numpy_args = ("axis", "dtype", "out")
+ msg = (
+ "numpy operations are not "
+ "valid with window objects. "
+ "Use .{func}() directly instead ".format(func=name)
+ )
if len(args) > 0:
raise UnsupportedFunctionCall(msg)
@@ -309,10 +330,12 @@ def validate_window_func(name, args, kwargs):
def validate_rolling_func(name, args, kwargs):
- numpy_args = ('axis', 'dtype', 'out')
- msg = ("numpy operations are not "
- "valid with window objects. "
- "Use .rolling(...).{func}() instead ".format(func=name))
+ numpy_args = ("axis", "dtype", "out")
+ msg = (
+ "numpy operations are not "
+ "valid with window objects. "
+ "Use .rolling(...).{func}() instead ".format(func=name)
+ )
if len(args) > 0:
raise UnsupportedFunctionCall(msg)
@@ -323,10 +346,12 @@ def validate_rolling_func(name, args, kwargs):
def validate_expanding_func(name, args, kwargs):
- numpy_args = ('axis', 'dtype', 'out')
- msg = ("numpy operations are not "
- "valid with window objects. "
- "Use .expanding(...).{func}() instead ".format(func=name))
+ numpy_args = ("axis", "dtype", "out")
+ msg = (
+ "numpy operations are not "
+ "valid with window objects. "
+ "Use .expanding(...).{func}() instead ".format(func=name)
+ )
if len(args) > 0:
raise UnsupportedFunctionCall(msg)
@@ -349,14 +374,16 @@ def validate_groupby_func(name, args, kwargs, allowed=None):
kwargs = set(kwargs) - set(allowed)
if len(args) + len(kwargs) > 0:
- raise UnsupportedFunctionCall((
- "numpy operations are not valid "
- "with groupby. Use .groupby(...)."
- "{func}() instead".format(func=name)))
+ raise UnsupportedFunctionCall(
+ (
+ "numpy operations are not valid "
+ "with groupby. Use .groupby(...)."
+ "{func}() instead".format(func=name)
+ )
+ )
-RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod',
- 'mean', 'std', 'var')
+RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var")
def validate_resampler_func(method, args, kwargs):
@@ -367,10 +394,13 @@ def validate_resampler_func(method, args, kwargs):
"""
if len(args) + len(kwargs) > 0:
if method in RESAMPLER_NUMPY_OPS:
- raise UnsupportedFunctionCall((
- "numpy operations are not valid "
- "with resample. Use .resample(...)."
- "{func}() instead".format(func=method)))
+ raise UnsupportedFunctionCall(
+ (
+ "numpy operations are not valid "
+ "with resample. Use .resample(...)."
+ "{func}() instead".format(func=method)
+ )
+ )
else:
raise TypeError("too many arguments passed in")
@@ -392,5 +422,7 @@ def validate_minmax_axis(axis):
if axis is None:
return
if axis >= ndim or (axis < 0 and ndim + axis < 0):
- raise ValueError("`axis` must be fewer than the number of "
- "dimensions ({ndim})".format(ndim=ndim))
+ raise ValueError(
+ "`axis` must be fewer than the number of "
+ "dimensions ({ndim})".format(ndim=ndim)
+ )
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
index 3b63cbf1cfabb..0934d8529fdf7 100644
--- a/pandas/compat/pickle_compat.py
+++ b/pandas/compat/pickle_compat.py
@@ -26,7 +26,7 @@ def load_reduce(self):
# If we have a deprecated function,
# try to replace and try again.
- msg = '_reconstruct: First argument must be a sub-type of ndarray'
+ msg = "_reconstruct: First argument must be a sub-type of ndarray"
if msg in str(e):
try:
@@ -37,10 +37,11 @@ def load_reduce(self):
pass
# try to re-encode the arguments
- if getattr(self, 'encoding', None) is not None:
- args = tuple(arg.encode(self.encoding)
- if isinstance(arg, str)
- else arg for arg in args)
+ if getattr(self, "encoding", None) is not None:
+ args = tuple(
+ arg.encode(self.encoding) if isinstance(arg, str) else arg
+ for arg in args
+ )
try:
stack[-1] = func(*args)
return
@@ -48,7 +49,7 @@ def load_reduce(self):
pass
# unknown exception, re-raise
- if getattr(self, 'is_verbose', None):
+ if getattr(self, "is_verbose", None):
print(sys.exc_info())
print(func, args)
raise
@@ -56,9 +57,7 @@ def load_reduce(self):
# If classes are moved, provide compat here.
_class_locations_map = {
- ('pandas.core.sparse.array', 'SparseArray'):
- ('pandas.core.arrays', 'SparseArray'),
-
+ ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"),
# 15477
#
# TODO: When FrozenNDArray is removed, add
@@ -71,75 +70,84 @@ def load_reduce(self):
#
# Afterwards, remove the current entry
# for `pandas.core.base.FrozenNDArray`.
- ('pandas.core.base', 'FrozenNDArray'):
- ('pandas.core.indexes.frozen', 'FrozenNDArray'),
- ('pandas.core.base', 'FrozenList'):
- ('pandas.core.indexes.frozen', 'FrozenList'),
-
+ ("pandas.core.base", "FrozenNDArray"): (
+ "pandas.core.indexes.frozen",
+ "FrozenNDArray",
+ ),
+ ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"),
# 10890
- ('pandas.core.series', 'TimeSeries'):
- ('pandas.core.series', 'Series'),
- ('pandas.sparse.series', 'SparseTimeSeries'):
- ('pandas.core.sparse.series', 'SparseSeries'),
-
+ ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"),
+ ("pandas.sparse.series", "SparseTimeSeries"): (
+ "pandas.core.sparse.series",
+ "SparseSeries",
+ ),
# 12588, extensions moving
- ('pandas._sparse', 'BlockIndex'):
- ('pandas._libs.sparse', 'BlockIndex'),
- ('pandas.tslib', 'Timestamp'):
- ('pandas._libs.tslib', 'Timestamp'),
-
+ ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"),
+ ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"),
# 18543 moving period
- ('pandas._period', 'Period'): ('pandas._libs.tslibs.period', 'Period'),
- ('pandas._libs.period', 'Period'):
- ('pandas._libs.tslibs.period', 'Period'),
-
+ ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"),
+ ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"),
# 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype
- ('pandas.tslib', '__nat_unpickle'):
- ('pandas._libs.tslibs.nattype', '__nat_unpickle'),
- ('pandas._libs.tslib', '__nat_unpickle'):
- ('pandas._libs.tslibs.nattype', '__nat_unpickle'),
-
+ ("pandas.tslib", "__nat_unpickle"): (
+ "pandas._libs.tslibs.nattype",
+ "__nat_unpickle",
+ ),
+ ("pandas._libs.tslib", "__nat_unpickle"): (
+ "pandas._libs.tslibs.nattype",
+ "__nat_unpickle",
+ ),
# 15998 top-level dirs moving
- ('pandas.sparse.array', 'SparseArray'):
- ('pandas.core.arrays.sparse', 'SparseArray'),
- ('pandas.sparse.series', 'SparseSeries'):
- ('pandas.core.sparse.series', 'SparseSeries'),
- ('pandas.sparse.frame', 'SparseDataFrame'):
- ('pandas.core.sparse.frame', 'SparseDataFrame'),
- ('pandas.indexes.base', '_new_Index'):
- ('pandas.core.indexes.base', '_new_Index'),
- ('pandas.indexes.base', 'Index'):
- ('pandas.core.indexes.base', 'Index'),
- ('pandas.indexes.numeric', 'Int64Index'):
- ('pandas.core.indexes.numeric', 'Int64Index'),
- ('pandas.indexes.range', 'RangeIndex'):
- ('pandas.core.indexes.range', 'RangeIndex'),
- ('pandas.indexes.multi', 'MultiIndex'):
- ('pandas.core.indexes.multi', 'MultiIndex'),
- ('pandas.tseries.index', '_new_DatetimeIndex'):
- ('pandas.core.indexes.datetimes', '_new_DatetimeIndex'),
- ('pandas.tseries.index', 'DatetimeIndex'):
- ('pandas.core.indexes.datetimes', 'DatetimeIndex'),
- ('pandas.tseries.period', 'PeriodIndex'):
- ('pandas.core.indexes.period', 'PeriodIndex'),
-
+ ("pandas.sparse.array", "SparseArray"): (
+ "pandas.core.arrays.sparse",
+ "SparseArray",
+ ),
+ ("pandas.sparse.series", "SparseSeries"): (
+ "pandas.core.sparse.series",
+ "SparseSeries",
+ ),
+ ("pandas.sparse.frame", "SparseDataFrame"): (
+ "pandas.core.sparse.frame",
+ "SparseDataFrame",
+ ),
+ ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"),
+ ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"),
+ ("pandas.indexes.numeric", "Int64Index"): (
+ "pandas.core.indexes.numeric",
+ "Int64Index",
+ ),
+ ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"),
+ ("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"),
+ ("pandas.tseries.index", "_new_DatetimeIndex"): (
+ "pandas.core.indexes.datetimes",
+ "_new_DatetimeIndex",
+ ),
+ ("pandas.tseries.index", "DatetimeIndex"): (
+ "pandas.core.indexes.datetimes",
+ "DatetimeIndex",
+ ),
+ ("pandas.tseries.period", "PeriodIndex"): (
+ "pandas.core.indexes.period",
+ "PeriodIndex",
+ ),
# 19269, arrays moving
- ('pandas.core.categorical', 'Categorical'):
- ('pandas.core.arrays', 'Categorical'),
-
+ ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"),
# 19939, add timedeltaindex, float64index compat from 15998 move
- ('pandas.tseries.tdi', 'TimedeltaIndex'):
- ('pandas.core.indexes.timedeltas', 'TimedeltaIndex'),
- ('pandas.indexes.numeric', 'Float64Index'):
- ('pandas.core.indexes.numeric', 'Float64Index'),
+ ("pandas.tseries.tdi", "TimedeltaIndex"): (
+ "pandas.core.indexes.timedeltas",
+ "TimedeltaIndex",
+ ),
+ ("pandas.indexes.numeric", "Float64Index"): (
+ "pandas.core.indexes.numeric",
+ "Float64Index",
+ ),
}
# our Unpickler sub-class to override methods and some dispatcher
# functions for compat
-class Unpickler(pkl._Unpickler): # type: ignore
+class Unpickler(pkl._Unpickler): # type: ignore
def find_class(self, module, name):
# override superclass
key = (module, name)
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 058361af343b6..29833ab2fc0fa 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -24,68 +24,70 @@
# or `deadline=None` to entirely disable timeouts for that test.
deadline=500,
timeout=hypothesis.unlimited,
- suppress_health_check=(hypothesis.HealthCheck.too_slow,)
+ suppress_health_check=(hypothesis.HealthCheck.too_slow,),
)
hypothesis.settings.load_profile("ci")
def pytest_addoption(parser):
- parser.addoption("--skip-slow", action="store_true",
- help="skip slow tests")
- parser.addoption("--skip-network", action="store_true",
- help="skip network tests")
- parser.addoption("--skip-db", action="store_true",
- help="skip db tests")
- parser.addoption("--run-high-memory", action="store_true",
- help="run high memory tests")
- parser.addoption("--only-slow", action="store_true",
- help="run only slow tests")
- parser.addoption("--strict-data-files", action="store_true",
- help="Fail if a test is skipped for missing data file.")
+ parser.addoption("--skip-slow", action="store_true", help="skip slow tests")
+ parser.addoption("--skip-network", action="store_true", help="skip network tests")
+ parser.addoption("--skip-db", action="store_true", help="skip db tests")
+ parser.addoption(
+ "--run-high-memory", action="store_true", help="run high memory tests"
+ )
+ parser.addoption("--only-slow", action="store_true", help="run only slow tests")
+ parser.addoption(
+ "--strict-data-files",
+ action="store_true",
+ help="Fail if a test is skipped for missing data file.",
+ )
def pytest_runtest_setup(item):
- if 'slow' in item.keywords and item.config.getoption("--skip-slow"):
+ if "slow" in item.keywords and item.config.getoption("--skip-slow"):
pytest.skip("skipping due to --skip-slow")
- if 'slow' not in item.keywords and item.config.getoption("--only-slow"):
+ if "slow" not in item.keywords and item.config.getoption("--only-slow"):
pytest.skip("skipping due to --only-slow")
- if 'network' in item.keywords and item.config.getoption("--skip-network"):
+ if "network" in item.keywords and item.config.getoption("--skip-network"):
pytest.skip("skipping due to --skip-network")
- if 'db' in item.keywords and item.config.getoption("--skip-db"):
+ if "db" in item.keywords and item.config.getoption("--skip-db"):
pytest.skip("skipping due to --skip-db")
- if 'high_memory' in item.keywords and not item.config.getoption(
- "--run-high-memory"):
- pytest.skip(
- "skipping high memory test since --run-high-memory was not set")
+ if "high_memory" in item.keywords and not item.config.getoption(
+ "--run-high-memory"
+ ):
+ pytest.skip("skipping high memory test since --run-high-memory was not set")
# Configurations for all tests and all test modules
+
@pytest.fixture(autouse=True)
def configure_tests():
- pd.set_option('chained_assignment', 'raise')
+ pd.set_option("chained_assignment", "raise")
# For running doctests: make np and pd names available
+
@pytest.fixture(autouse=True)
def add_imports(doctest_namespace):
- doctest_namespace['np'] = np
- doctest_namespace['pd'] = pd
+ doctest_namespace["np"] = np
+ doctest_namespace["pd"] = pd
-@pytest.fixture(params=['bsr', 'coo', 'csc', 'csr', 'dia', 'dok', 'lil'])
+@pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"])
def spmatrix(request):
from scipy import sparse
- return getattr(sparse, request.param + '_matrix')
+
+ return getattr(sparse, request.param + "_matrix")
-@pytest.fixture(params=[0, 1, 'index', 'columns'],
- ids=lambda x: "axis {!r}".format(x))
+@pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: "axis {!r}".format(x))
def axis(request):
"""
Fixture for returning the axis numbers of a DataFrame.
@@ -96,7 +98,7 @@ def axis(request):
axis_frame = axis
-@pytest.fixture(params=[0, 'index'], ids=lambda x: "axis {!r}".format(x))
+@pytest.fixture(params=[0, "index"], ids=lambda x: "axis {!r}".format(x))
def axis_series(request):
"""
Fixture for returning the axis numbers of a Series.
@@ -112,8 +114,9 @@ def ip():
Will raise a skip if IPython is not installed.
"""
- pytest.importorskip('IPython', minversion="6.0.0")
+ pytest.importorskip("IPython", minversion="6.0.0")
from IPython.core.interactiveshell import InteractiveShell
+
return InteractiveShell()
@@ -134,13 +137,22 @@ def ordered_fixture(request):
return request.param
-_all_arithmetic_operators = ['__add__', '__radd__',
- '__sub__', '__rsub__',
- '__mul__', '__rmul__',
- '__floordiv__', '__rfloordiv__',
- '__truediv__', '__rtruediv__',
- '__pow__', '__rpow__',
- '__mod__', '__rmod__']
+_all_arithmetic_operators = [
+ "__add__",
+ "__radd__",
+ "__sub__",
+ "__rsub__",
+ "__mul__",
+ "__rmul__",
+ "__floordiv__",
+ "__rfloordiv__",
+ "__truediv__",
+ "__rtruediv__",
+ "__pow__",
+ "__rpow__",
+ "__mod__",
+ "__rmod__",
+]
@pytest.fixture(params=_all_arithmetic_operators)
@@ -151,9 +163,18 @@ def all_arithmetic_operators(request):
return request.param
-_all_numeric_reductions = ['sum', 'max', 'min',
- 'mean', 'prod', 'std', 'var', 'median',
- 'kurt', 'skew']
+_all_numeric_reductions = [
+ "sum",
+ "max",
+ "min",
+ "mean",
+ "prod",
+ "std",
+ "var",
+ "median",
+ "kurt",
+ "skew",
+]
@pytest.fixture(params=_all_numeric_reductions)
@@ -164,7 +185,7 @@ def all_numeric_reductions(request):
return request.param
-_all_boolean_reductions = ['all', 'any']
+_all_boolean_reductions = ["all", "any"]
@pytest.fixture(params=_all_boolean_reductions)
@@ -202,13 +223,15 @@ def _get_cython_table_params(ndframe, func_names_and_expected):
results = []
for func_name, expected in func_names_and_expected:
results.append((ndframe, func_name, expected))
- results += [(ndframe, func, expected) for func, name in _cython_table
- if name == func_name]
+ results += [
+ (ndframe, func, expected)
+ for func, name in _cython_table
+ if name == func_name
+ ]
return results
-@pytest.fixture(params=['__eq__', '__ne__', '__le__',
- '__lt__', '__ge__', '__gt__'])
+@pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"])
def all_compare_operators(request):
"""
Fixture for dunder names for common compare operations
@@ -223,7 +246,7 @@ def all_compare_operators(request):
return request.param
-@pytest.fixture(params=['__le__', '__lt__', '__ge__', '__gt__'])
+@pytest.fixture(params=["__le__", "__lt__", "__ge__", "__gt__"])
def compare_operators_no_eq_ne(request):
"""
Fixture for dunder names for compare operations except == and !=
@@ -236,7 +259,7 @@ def compare_operators_no_eq_ne(request):
return request.param
-@pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', 'xz'])
+@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"])
def compression(request):
"""
Fixture for trying common compression types in compression tests
@@ -244,7 +267,7 @@ def compression(request):
return request.param
-@pytest.fixture(params=['gzip', 'bz2', 'zip', 'xz'])
+@pytest.fixture(params=["gzip", "bz2", "zip", "xz"])
def compression_only(request):
"""
Fixture for trying common compression types in compression tests excluding
@@ -261,12 +284,12 @@ def writable(request):
return request.param
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
def datetime_tz_utc():
return timezone.utc
-@pytest.fixture(params=['utc', 'dateutil/UTC', utc, tzutc(), timezone.utc])
+@pytest.fixture(params=["utc", "dateutil/UTC", utc, tzutc(), timezone.utc])
def utc_fixture(request):
"""
Fixture to provide variants of UTC timezone strings and tzinfo objects
@@ -274,7 +297,7 @@ def utc_fixture(request):
return request.param
-@pytest.fixture(params=['inner', 'outer', 'left', 'right'])
+@pytest.fixture(params=["inner", "outer", "left", "right"])
def join_type(request):
"""
Fixture for trying all types of join operations
@@ -305,7 +328,7 @@ def datapath(strict_data_files):
ValueError
If the path doesn't exist and the --strict-data-files option is set.
"""
- BASE_PATH = os.path.join(os.path.dirname(__file__), 'tests')
+ BASE_PATH = os.path.join(os.path.dirname(__file__), "tests")
def deco(*args):
path = os.path.join(BASE_PATH, *args)
@@ -317,16 +340,17 @@ def deco(*args):
msg = "Could not find {}."
pytest.skip(msg.format(path))
return path
+
return deco
@pytest.fixture
def iris(datapath):
"""The iris dataset as a DataFrame."""
- return pd.read_csv(datapath('data', 'iris.csv'))
+ return pd.read_csv(datapath("data", "iris.csv"))
-@pytest.fixture(params=['nlargest', 'nsmallest'])
+@pytest.fixture(params=["nlargest", "nsmallest"])
def nselect_method(request):
"""
Fixture for trying all nselect methods
@@ -334,7 +358,7 @@ def nselect_method(request):
return request.param
-@pytest.fixture(params=['left', 'right', 'both', 'neither'])
+@pytest.fixture(params=["left", "right", "both", "neither"])
def closed(request):
"""
Fixture for trying all interval closed parameters
@@ -342,7 +366,7 @@ def closed(request):
return request.param
-@pytest.fixture(params=['left', 'right', 'both', 'neither'])
+@pytest.fixture(params=["left", "right", "both", "neither"])
def other_closed(request):
"""
Secondary closed fixture to allow parametrizing over all pairs of closed
@@ -350,7 +374,7 @@ def other_closed(request):
return request.param
-@pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')])
+@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN")])
def nulls_fixture(request):
"""
Fixture for each null type in pandas
@@ -373,11 +397,22 @@ def unique_nulls_fixture(request):
unique_nulls_fixture2 = unique_nulls_fixture
-TIMEZONES = [None, 'UTC', 'US/Eastern', 'Asia/Tokyo', 'dateutil/US/Pacific',
- 'dateutil/Asia/Singapore', tzutc(), tzlocal(), FixedOffset(300),
- FixedOffset(0), FixedOffset(-300), timezone.utc,
- timezone(timedelta(hours=1)),
- timezone(timedelta(hours=-1), name='foo')]
+TIMEZONES = [
+ None,
+ "UTC",
+ "US/Eastern",
+ "Asia/Tokyo",
+ "dateutil/US/Pacific",
+ "dateutil/Asia/Singapore",
+ tzutc(),
+ tzlocal(),
+ FixedOffset(300),
+ FixedOffset(0),
+ FixedOffset(-300),
+ timezone.utc,
+ timezone(timedelta(hours=1)),
+ timezone(timedelta(hours=-1), name="foo"),
+]
TIMEZONE_IDS = [repr(i) for i in TIMEZONES]
@@ -416,19 +451,26 @@ def tz_aware_fixture(request):
FLOAT_DTYPES = [float, "float32", "float64"]
COMPLEX_DTYPES = [complex, "complex64", "complex128"]
-STRING_DTYPES = [str, 'str', 'U']
+STRING_DTYPES = [str, "str", "U"]
-DATETIME64_DTYPES = ['datetime64[ns]', 'M8[ns]']
-TIMEDELTA64_DTYPES = ['timedelta64[ns]', 'm8[ns]']
+DATETIME64_DTYPES = ["datetime64[ns]", "M8[ns]"]
+TIMEDELTA64_DTYPES = ["timedelta64[ns]", "m8[ns]"]
-BOOL_DTYPES = [bool, 'bool']
-BYTES_DTYPES = [bytes, 'bytes']
-OBJECT_DTYPES = [object, 'object']
+BOOL_DTYPES = [bool, "bool"]
+BYTES_DTYPES = [bytes, "bytes"]
+OBJECT_DTYPES = [object, "object"]
ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES
-ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES +
- DATETIME64_DTYPES + TIMEDELTA64_DTYPES + BOOL_DTYPES +
- OBJECT_DTYPES + BYTES_DTYPES)
+ALL_NUMPY_DTYPES = (
+ ALL_REAL_DTYPES
+ + COMPLEX_DTYPES
+ + STRING_DTYPES
+ + DATETIME64_DTYPES
+ + TIMEDELTA64_DTYPES
+ + BOOL_DTYPES
+ + OBJECT_DTYPES
+ + BYTES_DTYPES
+)
@pytest.fixture(params=STRING_DTYPES)
@@ -618,29 +660,29 @@ def any_numpy_dtype(request):
# categoricals are handled separately
_any_skipna_inferred_dtype = [
- ('string', ['a', np.nan, 'c']),
- ('bytes', [b'a', np.nan, b'c']),
- ('empty', [np.nan, np.nan, np.nan]),
- ('empty', []),
- ('mixed-integer', ['a', np.nan, 2]),
- ('mixed', ['a', np.nan, 2.0]),
- ('floating', [1.0, np.nan, 2.0]),
- ('integer', [1, np.nan, 2]),
- ('mixed-integer-float', [1, np.nan, 2.0]),
- ('decimal', [Decimal(1), np.nan, Decimal(2)]),
- ('boolean', [True, np.nan, False]),
- ('datetime64', [np.datetime64('2013-01-01'), np.nan,
- np.datetime64('2018-01-01')]),
- ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]),
- ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
+ ("string", ["a", np.nan, "c"]),
+ ("bytes", [b"a", np.nan, b"c"]),
+ ("empty", [np.nan, np.nan, np.nan]),
+ ("empty", []),
+ ("mixed-integer", ["a", np.nan, 2]),
+ ("mixed", ["a", np.nan, 2.0]),
+ ("floating", [1.0, np.nan, 2.0]),
+ ("integer", [1, np.nan, 2]),
+ ("mixed-integer-float", [1, np.nan, 2.0]),
+ ("decimal", [Decimal(1), np.nan, Decimal(2)]),
+ ("boolean", [True, np.nan, False]),
+ ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]),
+ ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]),
+ ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
# The following two dtypes are commented out due to GH 23554
# ('complex', [1 + 1j, np.nan, 2 + 2j]),
# ('timedelta64', [np.timedelta64(1, 'D'),
# np.nan, np.timedelta64(2, 'D')]),
- ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
- ('time', [time(1), np.nan, time(2)]),
- ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]),
- ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])]
+ ("timedelta", [timedelta(1), np.nan, timedelta(2)]),
+ ("time", [time(1), np.nan, time(2)]),
+ ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]),
+ ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]),
+]
ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id
@@ -692,45 +734,55 @@ def any_skipna_inferred_dtype(request):
return inferred_dtype, values
-@pytest.fixture(params=[getattr(pd.offsets, o) for o in pd.offsets.__all__ if
- issubclass(getattr(pd.offsets, o), pd.offsets.Tick)])
+@pytest.fixture(
+ params=[
+ getattr(pd.offsets, o)
+ for o in pd.offsets.__all__
+ if issubclass(getattr(pd.offsets, o), pd.offsets.Tick)
+ ]
+)
def tick_classes(request):
"""
Fixture for Tick based datetime offsets available for a time series.
"""
return request.param
+
# ----------------------------------------------------------------
# Global setup for tests using Hypothesis
# Registering these strategies makes them globally available via st.from_type,
# which is use for offsets in tests/tseries/offsets/test_offsets_properties.py
-for name in 'MonthBegin MonthEnd BMonthBegin BMonthEnd'.split():
+for name in "MonthBegin MonthEnd BMonthBegin BMonthEnd".split():
cls = getattr(pd.tseries.offsets, name)
- st.register_type_strategy(cls, st.builds(
- cls,
- n=st.integers(-99, 99),
- normalize=st.booleans(),
- ))
+ st.register_type_strategy(
+ cls, st.builds(cls, n=st.integers(-99, 99), normalize=st.booleans())
+ )
-for name in 'YearBegin YearEnd BYearBegin BYearEnd'.split():
+for name in "YearBegin YearEnd BYearBegin BYearEnd".split():
cls = getattr(pd.tseries.offsets, name)
- st.register_type_strategy(cls, st.builds(
+ st.register_type_strategy(
cls,
- n=st.integers(-5, 5),
- normalize=st.booleans(),
- month=st.integers(min_value=1, max_value=12),
- ))
-
-for name in 'QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd'.split():
+ st.builds(
+ cls,
+ n=st.integers(-5, 5),
+ normalize=st.booleans(),
+ month=st.integers(min_value=1, max_value=12),
+ ),
+ )
+
+for name in "QuarterBegin QuarterEnd BQuarterBegin BQuarterEnd".split():
cls = getattr(pd.tseries.offsets, name)
- st.register_type_strategy(cls, st.builds(
+ st.register_type_strategy(
cls,
- n=st.integers(-24, 24),
- normalize=st.booleans(),
- startingMonth=st.integers(min_value=1, max_value=12)
- ))
+ st.builds(
+ cls,
+ n=st.integers(-24, 24),
+ normalize=st.booleans(),
+ startingMonth=st.integers(min_value=1, max_value=12),
+ ),
+ )
@pytest.fixture
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
index b092541da93e6..f84033e9c3c90 100644
--- a/pandas/core/accessor.py
+++ b/pandas/core/accessor.py
@@ -13,7 +13,8 @@
class DirNamesMixin:
_accessors = set() # type: Set[str]
_deprecations = frozenset(
- ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides'])
+ ["asobject", "base", "data", "flags", "itemsize", "strides"]
+ )
def _dir_deletions(self):
"""
@@ -50,8 +51,7 @@ class PandasDelegate:
"""
def _delegate_property_get(self, name, *args, **kwargs):
- raise TypeError("You cannot access the "
- "property {name}".format(name=name))
+ raise TypeError("You cannot access the " "property {name}".format(name=name))
def _delegate_property_set(self, name, value, *args, **kwargs):
raise TypeError("The property {name} cannot be set".format(name=name))
@@ -60,8 +60,7 @@ def _delegate_method(self, name, *args, **kwargs):
raise TypeError("You cannot call method {name}".format(name=name))
@classmethod
- def _add_delegate_accessors(cls, delegate, accessors, typ,
- overwrite=False):
+ def _add_delegate_accessors(cls, delegate, accessors, typ, overwrite=False):
"""
Add accessors to cls from the delegate class.
@@ -76,7 +75,6 @@ def _add_delegate_accessors(cls, delegate, accessors, typ,
"""
def _create_delegator_property(name):
-
def _getter(self):
return self._delegate_property_get(name)
@@ -86,11 +84,11 @@ def _setter(self, new_values):
_getter.__name__ = name
_setter.__name__ = name
- return property(fget=_getter, fset=_setter,
- doc=getattr(delegate, name).__doc__)
+ return property(
+ fget=_getter, fset=_setter, doc=getattr(delegate, name).__doc__
+ )
def _create_delegator_method(name):
-
def f(self, *args, **kwargs):
return self._delegate_method(name, *args, **kwargs)
@@ -101,7 +99,7 @@ def f(self, *args, **kwargs):
for name in accessors:
- if typ == 'property':
+ if typ == "property":
f = _create_delegator_property(name)
else:
f = _create_delegator_method(name)
@@ -138,9 +136,9 @@ def delegate_names(delegate, accessors, typ, overwrite=False):
class CategoricalAccessor(PandasDelegate):
[...]
"""
+
def add_delegate_accessors(cls):
- cls._add_delegate_accessors(delegate, accessors, typ,
- overwrite=overwrite)
+ cls._add_delegate_accessors(delegate, accessors, typ, overwrite=overwrite)
return cls
return add_delegate_accessors
@@ -151,6 +149,7 @@ def add_delegate_accessors(cls):
# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
# 2. We use a UserWarning instead of a custom Warning
+
class CachedAccessor:
"""
Custom property-like object (descriptor) for caching accessors.
@@ -164,6 +163,7 @@ class CachedAccessor:
should expect one of a ``Series``, ``DataFrame`` or ``Index`` as
the single argument ``data``
"""
+
def __init__(self, name, accessor):
self._name = name
self._accessor = accessor
@@ -185,14 +185,16 @@ def _register_accessor(name, cls):
def decorator(accessor):
if hasattr(cls, name):
warnings.warn(
- 'registration of accessor {!r} under name {!r} for type '
- '{!r} is overriding a preexisting attribute with the same '
- 'name.'.format(accessor, name, cls),
+ "registration of accessor {!r} under name {!r} for type "
+ "{!r} is overriding a preexisting attribute with the same "
+ "name.".format(accessor, name, cls),
UserWarning,
- stacklevel=2)
+ stacklevel=2,
+ )
setattr(cls, name, CachedAccessor(name, accessor))
cls._accessors.add(name)
return accessor
+
return decorator
@@ -266,25 +268,40 @@ def plot(self):
"""
-@Appender(_doc % dict(klass="DataFrame",
- others=("register_series_accessor, "
- "register_index_accessor")))
+@Appender(
+ _doc
+ % dict(
+ klass="DataFrame",
+ others=("register_series_accessor, " "register_index_accessor"),
+ )
+)
def register_dataframe_accessor(name):
from pandas import DataFrame
+
return _register_accessor(name, DataFrame)
-@Appender(_doc % dict(klass="Series",
- others=("register_dataframe_accessor, "
- "register_index_accessor")))
+@Appender(
+ _doc
+ % dict(
+ klass="Series",
+ others=("register_dataframe_accessor, " "register_index_accessor"),
+ )
+)
def register_series_accessor(name):
from pandas import Series
+
return _register_accessor(name, Series)
-@Appender(_doc % dict(klass="Index",
- others=("register_dataframe_accessor, "
- "register_series_accessor")))
+@Appender(
+ _doc
+ % dict(
+ klass="Index",
+ others=("register_dataframe_accessor, " "register_series_accessor"),
+ )
+)
def register_index_accessor(name):
from pandas import Index
+
return _register_accessor(name, Index)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 4e84d7b26b707..79f205de11878 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -13,16 +13,39 @@
from pandas.util._decorators import Appender, Substitution, deprecate_kwarg
from pandas.core.dtypes.cast import (
- construct_1d_object_array_from_listlike, maybe_promote)
+ construct_1d_object_array_from_listlike,
+ maybe_promote,
+)
from pandas.core.dtypes.common import (
- ensure_float64, ensure_int64, ensure_object, ensure_platform_int,
- ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype,
- is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype,
- is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype,
- is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype,
- is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype,
- is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype,
- is_unsigned_integer_dtype, needs_i8_conversion)
+ ensure_float64,
+ ensure_int64,
+ ensure_object,
+ ensure_platform_int,
+ ensure_uint64,
+ is_array_like,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_complex_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64tz_dtype,
+ is_datetimelike,
+ is_extension_array_dtype,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_interval_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ is_object_dtype,
+ is_period_dtype,
+ is_scalar,
+ is_signed_integer_dtype,
+ is_sparse,
+ is_timedelta64_dtype,
+ is_unsigned_integer_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, na_value_for_dtype
@@ -62,20 +85,19 @@ def _ensure_data(values, dtype=None):
# we check some simple dtypes first
try:
if is_object_dtype(dtype):
- return ensure_object(np.asarray(values)), 'object', 'object'
+ return ensure_object(np.asarray(values)), "object", "object"
if is_bool_dtype(values) or is_bool_dtype(dtype):
# we are actually coercing to uint64
# until our algos support uint8 directly (see TODO)
- return np.asarray(values).astype('uint64'), 'bool', 'uint64'
+ return np.asarray(values).astype("uint64"), "bool", "uint64"
elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
- return ensure_int64(values), 'int64', 'int64'
- elif (is_unsigned_integer_dtype(values) or
- is_unsigned_integer_dtype(dtype)):
- return ensure_uint64(values), 'uint64', 'uint64'
+ return ensure_int64(values), "int64", "int64"
+ elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype):
+ return ensure_uint64(values), "uint64", "uint64"
elif is_float_dtype(values) or is_float_dtype(dtype):
- return ensure_float64(values), 'float64', 'float64'
+ return ensure_float64(values), "float64", "float64"
elif is_object_dtype(values) and dtype is None:
- return ensure_object(np.asarray(values)), 'object', 'object'
+ return ensure_object(np.asarray(values)), "object", "object"
elif is_complex_dtype(values) or is_complex_dtype(dtype):
# ignore the fact that we are casting to float
@@ -83,24 +105,28 @@ def _ensure_data(values, dtype=None):
with catch_warnings():
simplefilter("ignore", np.ComplexWarning)
values = ensure_float64(values)
- return values, 'float64', 'float64'
+ return values, "float64", "float64"
except (TypeError, ValueError, OverflowError):
# if we are trying to coerce to a dtype
# and it is incompat this will fall thru to here
- return ensure_object(values), 'object', 'object'
+ return ensure_object(values), "object", "object"
# datetimelike
- if (needs_i8_conversion(values) or
- is_period_dtype(dtype) or
- is_datetime64_any_dtype(dtype) or
- is_timedelta64_dtype(dtype)):
+ if (
+ needs_i8_conversion(values)
+ or is_period_dtype(dtype)
+ or is_datetime64_any_dtype(dtype)
+ or is_timedelta64_dtype(dtype)
+ ):
if is_period_dtype(values) or is_period_dtype(dtype):
from pandas import PeriodIndex
+
values = PeriodIndex(values)
dtype = values.dtype
elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):
from pandas import TimedeltaIndex
+
values = TimedeltaIndex(values)
dtype = values.dtype
else:
@@ -108,31 +134,33 @@ def _ensure_data(values, dtype=None):
if values.ndim > 1 and is_datetime64_ns_dtype(values):
# Avoid calling the DatetimeIndex constructor as it is 1D only
# Note: this is reached by DataFrame.rank calls GH#27027
- asi8 = values.view('i8')
+ asi8 = values.view("i8")
dtype = values.dtype
- return asi8, dtype, 'int64'
+ return asi8, dtype, "int64"
from pandas import DatetimeIndex
+
values = DatetimeIndex(values)
dtype = values.dtype
- return values.asi8, dtype, 'int64'
+ return values.asi8, dtype, "int64"
- elif (is_categorical_dtype(values) and
- (is_categorical_dtype(dtype) or dtype is None)):
- values = getattr(values, 'values', values)
+ elif is_categorical_dtype(values) and (
+ is_categorical_dtype(dtype) or dtype is None
+ ):
+ values = getattr(values, "values", values)
values = values.codes
- dtype = 'category'
+ dtype = "category"
# we are actually coercing to int64
# until our algos support int* directly (not all do)
values = ensure_int64(values)
- return values, dtype, 'int64'
+ return values, dtype, "int64"
# we have failed, return object
values = np.asarray(values, dtype=np.object)
- return ensure_object(values), 'object', 'object'
+ return ensure_object(values), "object", "object"
def _reconstruct_data(values, dtype, original):
@@ -150,6 +178,7 @@ def _reconstruct_data(values, dtype, original):
Index for extension types, otherwise ndarray casted to dtype
"""
from pandas import Index
+
if is_extension_array_dtype(dtype):
values = dtype.construct_array_type()._from_sequence(values)
elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
@@ -172,7 +201,7 @@ def _ensure_arraylike(values):
"""
if not is_array_like(values):
inferred = lib.infer_dtype(values, skipna=False)
- if inferred in ['mixed', 'string', 'unicode']:
+ if inferred in ["mixed", "string", "unicode"]:
if isinstance(values, tuple):
values = list(values)
values = construct_1d_object_array_from_listlike(values)
@@ -182,11 +211,11 @@ def _ensure_arraylike(values):
_hashtables = {
- 'float64': (htable.Float64HashTable, htable.Float64Vector),
- 'uint64': (htable.UInt64HashTable, htable.UInt64Vector),
- 'int64': (htable.Int64HashTable, htable.Int64Vector),
- 'string': (htable.StringHashTable, htable.ObjectVector),
- 'object': (htable.PyObjectHashTable, htable.ObjectVector)
+ "float64": (htable.Float64HashTable, htable.Float64Vector),
+ "uint64": (htable.UInt64HashTable, htable.UInt64Vector),
+ "int64": (htable.Int64HashTable, htable.Int64Vector),
+ "string": (htable.StringHashTable, htable.ObjectVector),
+ "object": (htable.PyObjectHashTable, htable.ObjectVector),
}
@@ -206,15 +235,15 @@ def _get_hashtable_algo(values):
"""
values, dtype, ndtype = _ensure_data(values)
- if ndtype == 'object':
+ if ndtype == "object":
# it's cheaper to use a String Hash Table than Object; we infer
# including nulls because that is the only difference between
# StringHashTable and ObjectHashtable
- if lib.infer_dtype(values, skipna=False) in ['string']:
- ndtype = 'string'
+ if lib.infer_dtype(values, skipna=False) in ["string"]:
+ ndtype = "string"
else:
- ndtype = 'object'
+ ndtype = "object"
htable, table = _hashtables[ndtype]
return (htable, table, values, dtype, ndtype)
@@ -226,15 +255,15 @@ def _get_data_algo(values, func_map):
values = values._values_for_rank()
values, dtype, ndtype = _ensure_data(values)
- if ndtype == 'object':
+ if ndtype == "object":
# it's cheaper to use a String Hash Table than Object; we infer
# including nulls because that is the only difference between
# StringHashTable and ObjectHashtable
- if lib.infer_dtype(values, skipna=False) in ['string']:
- ndtype = 'string'
+ if lib.infer_dtype(values, skipna=False) in ["string"]:
+ ndtype = "string"
- f = func_map.get(ndtype, func_map['object'])
+ f = func_map.get(ndtype, func_map["object"])
return f, values
@@ -243,6 +272,7 @@ def _get_data_algo(values, func_map):
# top-level algos #
# --------------- #
+
def match(to_match, values, na_sentinel=-1):
"""
Compute locations of to_match into values
@@ -275,6 +305,7 @@ def match(to_match, values, na_sentinel=-1):
# replace but return a numpy array
# use a Series because it handles dtype conversions properly
from pandas import Series
+
result = Series(result.ravel()).replace(-1, na_sentinel)
result = result.values.reshape(result.shape)
@@ -393,13 +424,19 @@ def isin(comps, values):
"""
if not is_list_like(comps):
- raise TypeError("only list-like objects are allowed to be passed"
- " to isin(), you passed a [{comps_type}]"
- .format(comps_type=type(comps).__name__))
+ raise TypeError(
+ "only list-like objects are allowed to be passed"
+ " to isin(), you passed a [{comps_type}]".format(
+ comps_type=type(comps).__name__
+ )
+ )
if not is_list_like(values):
- raise TypeError("only list-like objects are allowed to be passed"
- " to isin(), you passed a [{values_type}]"
- .format(values_type=type(values).__name__))
+ raise TypeError(
+ "only list-like objects are allowed to be passed"
+ " to isin(), you passed a [{values_type}]".format(
+ values_type=type(values).__name__
+ )
+ )
if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
values = construct_1d_object_array_from_listlike(list(values))
@@ -423,8 +460,8 @@ def isin(comps, values):
f = lambda x, y: np.in1d(x, y)
elif is_integer_dtype(comps):
try:
- values = values.astype('int64', copy=False)
- comps = comps.astype('int64', copy=False)
+ values = values.astype("int64", copy=False)
+ comps = comps.astype("int64", copy=False)
f = lambda x, y: htable.ismember_int64(x, y)
except (TypeError, ValueError, OverflowError):
values = values.astype(object)
@@ -432,8 +469,8 @@ def isin(comps, values):
elif is_float_dtype(comps):
try:
- values = values.astype('float64', copy=False)
- comps = comps.astype('float64', copy=False)
+ values = values.astype("float64", copy=False)
+ comps = comps.astype("float64", copy=False)
f = lambda x, y: htable.ismember_float64(x, y)
except (TypeError, ValueError):
values = values.astype(object)
@@ -442,8 +479,7 @@ def isin(comps, values):
return f(comps, values)
-def _factorize_array(values, na_sentinel=-1, size_hint=None,
- na_value=None):
+def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None):
"""Factorize an array-like to labels and uniques.
This doesn't do any coercion of types or unboxing before factorization.
@@ -467,14 +503,17 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
(hash_klass, _), values = _get_data_algo(values, _hashtables)
table = hash_klass(size_hint or len(values))
- uniques, labels = table.factorize(values, na_sentinel=na_sentinel,
- na_value=na_value)
+ uniques, labels = table.factorize(
+ values, na_sentinel=na_sentinel, na_value=na_value
+ )
labels = ensure_platform_int(labels)
return labels, uniques
-_shared_docs['factorize'] = """
+_shared_docs[
+ "factorize"
+] = """
Encode the object as an enumerated type or categorical variable.
This method is useful for obtaining a numeric representation of an
@@ -568,29 +607,37 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
@Substitution(
- values=dedent("""\
+ values=dedent(
+ """\
values : sequence
A 1-D sequence. Sequences that aren't pandas objects are
coerced to ndarrays before factorization.
- """),
- order=dedent("""\
+ """
+ ),
+ order=dedent(
+ """\
order : None
.. deprecated:: 0.23.0
This parameter has no effect and is deprecated.
- """),
- sort=dedent("""\
+ """
+ ),
+ sort=dedent(
+ """\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
relationship.
- """),
- size_hint=dedent("""\
+ """
+ ),
+ size_hint=dedent(
+ """\
size_hint : int, optional
Hint to the hashtable sizer.
- """),
+ """
+ ),
)
-@Appender(_shared_docs['factorize'])
-@deprecate_kwarg(old_arg_name='order', new_arg_name=None)
+@Appender(_shared_docs["factorize"])
+@deprecate_kwarg(old_arg_name="order", new_arg_name=None)
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
@@ -605,28 +652,31 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
original = values
if is_extension_array_dtype(values):
- values = getattr(values, '_values', values)
+ values = getattr(values, "_values", values)
labels, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
values, dtype, _ = _ensure_data(values)
- if (is_datetime64_any_dtype(original) or
- is_timedelta64_dtype(original) or
- is_period_dtype(original)):
+ if (
+ is_datetime64_any_dtype(original)
+ or is_timedelta64_dtype(original)
+ or is_period_dtype(original)
+ ):
na_value = na_value_for_dtype(original.dtype)
else:
na_value = None
- labels, uniques = _factorize_array(values,
- na_sentinel=na_sentinel,
- size_hint=size_hint,
- na_value=na_value)
+ labels, uniques = _factorize_array(
+ values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
+ )
if sort and len(uniques) > 0:
from pandas.core.sorting import safe_sort
- uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
- assume_unique=True, verify=False)
+
+ uniques, labels = safe_sort(
+ uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
+ )
uniques = _reconstruct_data(uniques, dtype, original)
@@ -635,13 +685,15 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
uniques = original._shallow_copy(uniques, name=None)
elif isinstance(original, ABCSeries):
from pandas import Index
+
uniques = Index(uniques)
return labels, uniques
-def value_counts(values, sort=True, ascending=False, normalize=False,
- bins=None, dropna=True):
+def value_counts(
+ values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
+):
"""
Compute a histogram of the counts of non-null values.
@@ -666,11 +718,13 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
"""
from pandas.core.series import Series, Index
- name = getattr(values, 'name', None)
+
+ name = getattr(values, "name", None)
if bins is not None:
try:
from pandas.core.reshape.tile import cut
+
values = Series(values)
ii = cut(values, bins, include_lowest=True)
except TypeError:
@@ -679,7 +733,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
# count, remove nulls (from the index), and but the bins
result = ii.value_counts(dropna=dropna)
result = result[result.index.notna()]
- result.index = result.index.astype('interval')
+ result.index = result.index.astype("interval")
result = result.sort_index()
# if we are dropna and we have NO values
@@ -757,7 +811,7 @@ def _value_counts_arraylike(values, dropna):
return keys, counts
-def duplicated(values, keep='first'):
+def duplicated(values, keep="first"):
"""
Return boolean ndarray denoting duplicate values.
@@ -829,8 +883,7 @@ def mode(values, dropna=True):
return Series(result)
-def rank(values, axis=0, method='average', na_option='keep',
- ascending=True, pct=False):
+def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct=False):
"""
Rank the values along a given axis.
@@ -856,12 +909,23 @@ def rank(values, axis=0, method='average', na_option='keep',
"""
if values.ndim == 1:
f, values = _get_data_algo(values, _rank1d_functions)
- ranks = f(values, ties_method=method, ascending=ascending,
- na_option=na_option, pct=pct)
+ ranks = f(
+ values,
+ ties_method=method,
+ ascending=ascending,
+ na_option=na_option,
+ pct=pct,
+ )
elif values.ndim == 2:
f, values = _get_data_algo(values, _rank2d_functions)
- ranks = f(values, axis=axis, ties_method=method,
- ascending=ascending, na_option=na_option, pct=pct)
+ ranks = f(
+ values,
+ axis=axis,
+ ties_method=method,
+ ascending=ascending,
+ na_option=na_option,
+ pct=pct,
+ )
else:
raise TypeError("Array with ndim > 2 are not supported.")
@@ -932,10 +996,12 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
elif not mask2.any():
to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
else:
- to_raise = (((np.iinfo(np.int64).max -
- b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
- ((np.iinfo(np.int64).min -
- b2[mask2] > arr[mask2]) & not_nan[mask2]).any())
+ to_raise = (
+ ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any()
+ or (
+ (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
+ ).any()
+ )
if to_raise:
raise OverflowError("Overflow in int64 addition")
@@ -943,21 +1009,21 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
_rank1d_functions = {
- 'float64': algos.rank_1d_float64,
- 'int64': algos.rank_1d_int64,
- 'uint64': algos.rank_1d_uint64,
- 'object': algos.rank_1d_object
+ "float64": algos.rank_1d_float64,
+ "int64": algos.rank_1d_int64,
+ "uint64": algos.rank_1d_uint64,
+ "object": algos.rank_1d_object,
}
_rank2d_functions = {
- 'float64': algos.rank_2d_float64,
- 'int64': algos.rank_2d_int64,
- 'uint64': algos.rank_2d_uint64,
- 'object': algos.rank_2d_object
+ "float64": algos.rank_2d_float64,
+ "int64": algos.rank_2d_int64,
+ "uint64": algos.rank_2d_uint64,
+ "object": algos.rank_2d_object,
}
-def quantile(x, q, interpolation_method='fraction'):
+def quantile(x, q, interpolation_method="fraction"):
"""
Compute sample quantile or quantiles of the input array. For example, q=0.5
computes the median.
@@ -1017,16 +1083,17 @@ def _get_score(at):
if idx % 1 == 0:
score = values[int(idx)]
else:
- if interpolation_method == 'fraction':
- score = _interpolate(values[int(idx)], values[int(idx) + 1],
- idx % 1)
- elif interpolation_method == 'lower':
+ if interpolation_method == "fraction":
+ score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1)
+ elif interpolation_method == "lower":
score = values[np.floor(idx)]
- elif interpolation_method == 'higher':
+ elif interpolation_method == "higher":
score = values[np.ceil(idx)]
else:
- raise ValueError("interpolation_method can only be 'fraction' "
- ", 'lower' or 'higher'")
+ raise ValueError(
+ "interpolation_method can only be 'fraction' "
+ ", 'lower' or 'higher'"
+ )
return score
@@ -1041,21 +1108,21 @@ def _get_score(at):
# select n #
# --------------- #
-class SelectN:
+class SelectN:
def __init__(self, obj, n, keep):
self.obj = obj
self.n = n
self.keep = keep
- if self.keep not in ('first', 'last', 'all'):
+ if self.keep not in ("first", "last", "all"):
raise ValueError('keep must be either "first", "last" or "all"')
def nlargest(self):
- return self.compute('nlargest')
+ return self.compute("nlargest")
def nsmallest(self):
- return self.compute('nsmallest')
+ return self.compute("nsmallest")
@staticmethod
def is_valid_dtype_n_method(dtype):
@@ -1063,8 +1130,9 @@ def is_valid_dtype_n_method(dtype):
Helper function to determine if dtype is valid for
nsmallest/nlargest methods
"""
- return ((is_numeric_dtype(dtype) and not is_complex_dtype(dtype)) or
- needs_i8_conversion(dtype))
+ return (
+ is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
+ ) or needs_i8_conversion(dtype)
class SelectNSeries(SelectN):
@@ -1087,9 +1155,10 @@ def compute(self, method):
n = self.n
dtype = self.obj.dtype
if not self.is_valid_dtype_n_method(dtype):
- raise TypeError("Cannot use method '{method}' with "
- "dtype {dtype}".format(method=method,
- dtype=dtype))
+ raise TypeError(
+ "Cannot use method '{method}' with "
+ "dtype {dtype}".format(method=method, dtype=dtype)
+ )
if n <= 0:
return self.obj[[]]
@@ -1099,14 +1168,14 @@ def compute(self, method):
# slow method
if n >= len(self.obj):
- reverse_it = (self.keep == 'last' or method == 'nlargest')
- ascending = method == 'nsmallest'
+ reverse_it = self.keep == "last" or method == "nlargest"
+ ascending = method == "nsmallest"
slc = np.s_[::-1] if reverse_it else np.s_[:]
return dropped[slc].sort_values(ascending=ascending).head(n)
# fast method
arr, pandas_dtype, _ = _ensure_data(dropped.values)
- if method == 'nlargest':
+ if method == "nlargest":
arr = -arr
if is_integer_dtype(pandas_dtype):
# GH 21426: ensure reverse ordering at boundaries
@@ -1116,7 +1185,7 @@ def compute(self, method):
# GH 26154: ensure False is smaller than True
arr = 1 - (-arr)
- if self.keep == 'last':
+ if self.keep == "last":
arr = arr[::-1]
narr = len(arr)
@@ -1124,12 +1193,12 @@ def compute(self, method):
kth_val = algos.kth_smallest(arr.copy(), n - 1)
ns, = np.nonzero(arr <= kth_val)
- inds = ns[arr[ns].argsort(kind='mergesort')]
+ inds = ns[arr[ns].argsort(kind="mergesort")]
- if self.keep != 'all':
+ if self.keep != "all":
inds = inds[:n]
- if self.keep == 'last':
+ if self.keep == "last":
# reverse indices
inds = narr - 1 - inds
@@ -1162,6 +1231,7 @@ def __init__(self, obj, n, keep, columns):
def compute(self, method):
from pandas import Int64Index
+
n = self.n
frame = self.obj
columns = self.columns
@@ -1169,16 +1239,18 @@ def compute(self, method):
for column in columns:
dtype = frame[column].dtype
if not self.is_valid_dtype_n_method(dtype):
- raise TypeError((
- "Column {column!r} has dtype {dtype}, cannot use method "
- "{method!r} with this dtype"
- ).format(column=column, dtype=dtype, method=method))
+ raise TypeError(
+ (
+ "Column {column!r} has dtype {dtype}, cannot use method "
+ "{method!r} with this dtype"
+ ).format(column=column, dtype=dtype, method=method)
+ )
def get_indexer(current_indexer, other_indexer):
"""Helper function to concat `current_indexer` and `other_indexer`
depending on `method`
"""
- if method == 'nsmallest':
+ if method == "nsmallest":
return current_indexer.append(other_indexer)
else:
return other_indexer.append(current_indexer)
@@ -1200,8 +1272,8 @@ def get_indexer(current_indexer, other_indexer):
series = cur_frame[column]
is_last_column = len(columns) - 1 == i
values = getattr(series, method)(
- cur_n,
- keep=self.keep if is_last_column else 'all')
+ cur_n, keep=self.keep if is_last_column else "all"
+ )
if is_last_column or len(values) <= cur_n:
indexer = get_indexer(indexer, values.index)
@@ -1234,12 +1306,9 @@ def get_indexer(current_indexer, other_indexer):
if len(columns) == 1:
return frame
- ascending = method == 'nsmallest'
+ ascending = method == "nsmallest"
- return frame.sort_values(
- columns,
- ascending=ascending,
- kind='mergesort')
+ return frame.sort_values(columns, ascending=ascending, kind="mergesort")
# ------- ## ---- #
@@ -1308,110 +1377,103 @@ def _take_nd_object(arr, indexer, out, axis, fill_value, mask_info):
_take_1d_dict = {
- ('int8', 'int8'): algos.take_1d_int8_int8,
- ('int8', 'int32'): algos.take_1d_int8_int32,
- ('int8', 'int64'): algos.take_1d_int8_int64,
- ('int8', 'float64'): algos.take_1d_int8_float64,
- ('int16', 'int16'): algos.take_1d_int16_int16,
- ('int16', 'int32'): algos.take_1d_int16_int32,
- ('int16', 'int64'): algos.take_1d_int16_int64,
- ('int16', 'float64'): algos.take_1d_int16_float64,
- ('int32', 'int32'): algos.take_1d_int32_int32,
- ('int32', 'int64'): algos.take_1d_int32_int64,
- ('int32', 'float64'): algos.take_1d_int32_float64,
- ('int64', 'int64'): algos.take_1d_int64_int64,
- ('int64', 'float64'): algos.take_1d_int64_float64,
- ('float32', 'float32'): algos.take_1d_float32_float32,
- ('float32', 'float64'): algos.take_1d_float32_float64,
- ('float64', 'float64'): algos.take_1d_float64_float64,
- ('object', 'object'): algos.take_1d_object_object,
- ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8,
- np.uint8),
- ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8,
- None),
- ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(
- algos.take_1d_int64_int64, np.int64, np.int64, np.int64)
+ ("int8", "int8"): algos.take_1d_int8_int8,
+ ("int8", "int32"): algos.take_1d_int8_int32,
+ ("int8", "int64"): algos.take_1d_int8_int64,
+ ("int8", "float64"): algos.take_1d_int8_float64,
+ ("int16", "int16"): algos.take_1d_int16_int16,
+ ("int16", "int32"): algos.take_1d_int16_int32,
+ ("int16", "int64"): algos.take_1d_int16_int64,
+ ("int16", "float64"): algos.take_1d_int16_float64,
+ ("int32", "int32"): algos.take_1d_int32_int32,
+ ("int32", "int64"): algos.take_1d_int32_int64,
+ ("int32", "float64"): algos.take_1d_int32_float64,
+ ("int64", "int64"): algos.take_1d_int64_int64,
+ ("int64", "float64"): algos.take_1d_int64_float64,
+ ("float32", "float32"): algos.take_1d_float32_float32,
+ ("float32", "float64"): algos.take_1d_float32_float64,
+ ("float64", "float64"): algos.take_1d_float64_float64,
+ ("object", "object"): algos.take_1d_object_object,
+ ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8),
+ ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None),
+ ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
+ algos.take_1d_int64_int64, np.int64, np.int64, np.int64
+ ),
}
_take_2d_axis0_dict = {
- ('int8', 'int8'): algos.take_2d_axis0_int8_int8,
- ('int8', 'int32'): algos.take_2d_axis0_int8_int32,
- ('int8', 'int64'): algos.take_2d_axis0_int8_int64,
- ('int8', 'float64'): algos.take_2d_axis0_int8_float64,
- ('int16', 'int16'): algos.take_2d_axis0_int16_int16,
- ('int16', 'int32'): algos.take_2d_axis0_int16_int32,
- ('int16', 'int64'): algos.take_2d_axis0_int16_int64,
- ('int16', 'float64'): algos.take_2d_axis0_int16_float64,
- ('int32', 'int32'): algos.take_2d_axis0_int32_int32,
- ('int32', 'int64'): algos.take_2d_axis0_int32_int64,
- ('int32', 'float64'): algos.take_2d_axis0_int32_float64,
- ('int64', 'int64'): algos.take_2d_axis0_int64_int64,
- ('int64', 'float64'): algos.take_2d_axis0_int64_float64,
- ('float32', 'float32'): algos.take_2d_axis0_float32_float32,
- ('float32', 'float64'): algos.take_2d_axis0_float32_float64,
- ('float64', 'float64'): algos.take_2d_axis0_float64_float64,
- ('object', 'object'): algos.take_2d_axis0_object_object,
- ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8,
- np.uint8),
- ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object,
- np.uint8, None),
- ('datetime64[ns]', 'datetime64[ns]'):
- _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64,
- fill_wrap=np.int64)
+ ("int8", "int8"): algos.take_2d_axis0_int8_int8,
+ ("int8", "int32"): algos.take_2d_axis0_int8_int32,
+ ("int8", "int64"): algos.take_2d_axis0_int8_int64,
+ ("int8", "float64"): algos.take_2d_axis0_int8_float64,
+ ("int16", "int16"): algos.take_2d_axis0_int16_int16,
+ ("int16", "int32"): algos.take_2d_axis0_int16_int32,
+ ("int16", "int64"): algos.take_2d_axis0_int16_int64,
+ ("int16", "float64"): algos.take_2d_axis0_int16_float64,
+ ("int32", "int32"): algos.take_2d_axis0_int32_int32,
+ ("int32", "int64"): algos.take_2d_axis0_int32_int64,
+ ("int32", "float64"): algos.take_2d_axis0_int32_float64,
+ ("int64", "int64"): algos.take_2d_axis0_int64_int64,
+ ("int64", "float64"): algos.take_2d_axis0_int64_float64,
+ ("float32", "float32"): algos.take_2d_axis0_float32_float32,
+ ("float32", "float64"): algos.take_2d_axis0_float32_float64,
+ ("float64", "float64"): algos.take_2d_axis0_float64_float64,
+ ("object", "object"): algos.take_2d_axis0_object_object,
+ ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8),
+ ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None),
+ ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
+ algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
+ ),
}
_take_2d_axis1_dict = {
- ('int8', 'int8'): algos.take_2d_axis1_int8_int8,
- ('int8', 'int32'): algos.take_2d_axis1_int8_int32,
- ('int8', 'int64'): algos.take_2d_axis1_int8_int64,
- ('int8', 'float64'): algos.take_2d_axis1_int8_float64,
- ('int16', 'int16'): algos.take_2d_axis1_int16_int16,
- ('int16', 'int32'): algos.take_2d_axis1_int16_int32,
- ('int16', 'int64'): algos.take_2d_axis1_int16_int64,
- ('int16', 'float64'): algos.take_2d_axis1_int16_float64,
- ('int32', 'int32'): algos.take_2d_axis1_int32_int32,
- ('int32', 'int64'): algos.take_2d_axis1_int32_int64,
- ('int32', 'float64'): algos.take_2d_axis1_int32_float64,
- ('int64', 'int64'): algos.take_2d_axis1_int64_int64,
- ('int64', 'float64'): algos.take_2d_axis1_int64_float64,
- ('float32', 'float32'): algos.take_2d_axis1_float32_float32,
- ('float32', 'float64'): algos.take_2d_axis1_float32_float64,
- ('float64', 'float64'): algos.take_2d_axis1_float64_float64,
- ('object', 'object'): algos.take_2d_axis1_object_object,
- ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8,
- np.uint8),
- ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object,
- np.uint8, None),
- ('datetime64[ns]', 'datetime64[ns]'):
- _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64,
- fill_wrap=np.int64)
+ ("int8", "int8"): algos.take_2d_axis1_int8_int8,
+ ("int8", "int32"): algos.take_2d_axis1_int8_int32,
+ ("int8", "int64"): algos.take_2d_axis1_int8_int64,
+ ("int8", "float64"): algos.take_2d_axis1_int8_float64,
+ ("int16", "int16"): algos.take_2d_axis1_int16_int16,
+ ("int16", "int32"): algos.take_2d_axis1_int16_int32,
+ ("int16", "int64"): algos.take_2d_axis1_int16_int64,
+ ("int16", "float64"): algos.take_2d_axis1_int16_float64,
+ ("int32", "int32"): algos.take_2d_axis1_int32_int32,
+ ("int32", "int64"): algos.take_2d_axis1_int32_int64,
+ ("int32", "float64"): algos.take_2d_axis1_int32_float64,
+ ("int64", "int64"): algos.take_2d_axis1_int64_int64,
+ ("int64", "float64"): algos.take_2d_axis1_int64_float64,
+ ("float32", "float32"): algos.take_2d_axis1_float32_float32,
+ ("float32", "float64"): algos.take_2d_axis1_float32_float64,
+ ("float64", "float64"): algos.take_2d_axis1_float64_float64,
+ ("object", "object"): algos.take_2d_axis1_object_object,
+ ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8),
+ ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None),
+ ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
+ algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
+ ),
}
_take_2d_multi_dict = {
- ('int8', 'int8'): algos.take_2d_multi_int8_int8,
- ('int8', 'int32'): algos.take_2d_multi_int8_int32,
- ('int8', 'int64'): algos.take_2d_multi_int8_int64,
- ('int8', 'float64'): algos.take_2d_multi_int8_float64,
- ('int16', 'int16'): algos.take_2d_multi_int16_int16,
- ('int16', 'int32'): algos.take_2d_multi_int16_int32,
- ('int16', 'int64'): algos.take_2d_multi_int16_int64,
- ('int16', 'float64'): algos.take_2d_multi_int16_float64,
- ('int32', 'int32'): algos.take_2d_multi_int32_int32,
- ('int32', 'int64'): algos.take_2d_multi_int32_int64,
- ('int32', 'float64'): algos.take_2d_multi_int32_float64,
- ('int64', 'int64'): algos.take_2d_multi_int64_int64,
- ('int64', 'float64'): algos.take_2d_multi_int64_float64,
- ('float32', 'float32'): algos.take_2d_multi_float32_float32,
- ('float32', 'float64'): algos.take_2d_multi_float32_float64,
- ('float64', 'float64'): algos.take_2d_multi_float64_float64,
- ('object', 'object'): algos.take_2d_multi_object_object,
- ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8,
- np.uint8),
- ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object,
- np.uint8, None),
- ('datetime64[ns]', 'datetime64[ns]'):
- _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64,
- fill_wrap=np.int64)
+ ("int8", "int8"): algos.take_2d_multi_int8_int8,
+ ("int8", "int32"): algos.take_2d_multi_int8_int32,
+ ("int8", "int64"): algos.take_2d_multi_int8_int64,
+ ("int8", "float64"): algos.take_2d_multi_int8_float64,
+ ("int16", "int16"): algos.take_2d_multi_int16_int16,
+ ("int16", "int32"): algos.take_2d_multi_int16_int32,
+ ("int16", "int64"): algos.take_2d_multi_int16_int64,
+ ("int16", "float64"): algos.take_2d_multi_int16_float64,
+ ("int32", "int32"): algos.take_2d_multi_int32_int32,
+ ("int32", "int64"): algos.take_2d_multi_int32_int64,
+ ("int32", "float64"): algos.take_2d_multi_int32_float64,
+ ("int64", "int64"): algos.take_2d_multi_int64_int64,
+ ("int64", "float64"): algos.take_2d_multi_int64_float64,
+ ("float32", "float32"): algos.take_2d_multi_float32_float32,
+ ("float32", "float64"): algos.take_2d_multi_float32_float64,
+ ("float64", "float64"): algos.take_2d_multi_float64_float64,
+ ("object", "object"): algos.take_2d_multi_object_object,
+ ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8),
+ ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None),
+ ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
+ algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
+ ),
}
@@ -1442,8 +1504,9 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None):
def func(arr, indexer, out, fill_value=np.nan):
indexer = ensure_int64(indexer)
- _take_nd_object(arr, indexer, out, axis=axis, fill_value=fill_value,
- mask_info=mask_info)
+ _take_nd_object(
+ arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
+ )
return func
@@ -1534,16 +1597,18 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None):
if allow_fill:
# Pandas style, -1 means NA
validate_indices(indices, arr.shape[axis])
- result = take_1d(arr, indices, axis=axis, allow_fill=True,
- fill_value=fill_value)
+ result = take_1d(
+ arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
+ )
else:
# NumPy style
result = arr.take(indices, axis=axis)
return result
-def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
- allow_fill=True):
+def take_nd(
+ arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
+):
"""
Specialized Cython take which sets NaN values in one pass
@@ -1618,7 +1683,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
mask_info = mask, needs_masking
if needs_masking:
if out is not None and out.dtype != dtype:
- raise TypeError('Incompatible type for fill_value')
+ raise TypeError("Incompatible type for fill_value")
else:
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
@@ -1647,12 +1712,13 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
# for dataframes initialized directly from 2-d ndarrays
# (s.t. df.values is c-contiguous and df._data.blocks[0] is its
# f-contiguous transpose)
- out = np.empty(out_shape, dtype=dtype, order='F')
+ out = np.empty(out_shape, dtype=dtype, order="F")
else:
out = np.empty(out_shape, dtype=dtype)
- func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,
- mask_info=mask_info)
+ func = _get_take_nd_function(
+ arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
+ )
func(arr, indexer, out, fill_value)
if flip_order:
@@ -1663,8 +1729,9 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
take_1d = take_nd
-def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,
- allow_fill=True):
+def take_2d_multi(
+ arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True
+):
"""
Specialized Cython take which sets NaN values in one pass
"""
@@ -1703,7 +1770,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,
mask_info = (row_mask, col_mask), (row_needs, col_needs)
if row_needs or col_needs:
if out is not None and out.dtype != dtype:
- raise TypeError('Incompatible type for fill_value')
+ raise TypeError("Incompatible type for fill_value")
else:
# if not, then depromote, set fill_value to dummy
# (it won't be used but we don't want the cython code
@@ -1724,8 +1791,9 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,
if func is None:
def func(arr, indexer, out, fill_value=np.nan):
- _take_2d_multi_object(arr, indexer, out, fill_value=fill_value,
- mask_info=mask_info)
+ _take_2d_multi_object(
+ arr, indexer, out, fill_value=fill_value, mask_info=mask_info
+ )
func(arr, indexer, out=out, fill_value=fill_value)
return out
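For reference, the `take_nd`/`take_2d_multi` helpers being reformatted here back the public `pandas.api.extensions.take`. A minimal sketch of the two indexing conventions they implement (illustrative only, not part of this patch):

    import numpy as np
    from pandas.api.extensions import take

    arr = np.array([10.0, 20.0, 30.0])

    # NumPy-style: -1 means "last element"
    take(arr, [0, -1])                   # array([10., 30.])

    # pandas-style: -1 means "missing", filled with fill_value (NaN by default)
    take(arr, [0, -1], allow_fill=True)  # array([10., nan])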
@@ -1735,6 +1803,7 @@ def func(arr, indexer, out, fill_value=np.nan):
# searchsorted #
# ------------ #
+
def searchsorted(arr, value, side="left", sorter=None):
"""
Find indices where elements should be inserted to maintain order.
@@ -1782,9 +1851,13 @@ def searchsorted(arr, value, side="left", sorter=None):
if sorter is not None:
sorter = ensure_platform_int(sorter)
- if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and (
- is_integer(value) or is_integer_dtype(value)):
+ if (
+ isinstance(arr, np.ndarray)
+ and is_integer_dtype(arr)
+ and (is_integer(value) or is_integer_dtype(value))
+ ):
from .arrays.array_ import array
+
# if `arr` and `value` have different dtypes, `arr` would be
# recast by numpy, causing a slow search.
# Before searching below, we therefore try to give `value` the
@@ -1802,9 +1875,11 @@ def searchsorted(arr, value, side="left", sorter=None):
value = dtype.type(value)
else:
value = array(value, dtype=dtype)
- elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or
- is_categorical_dtype(arr)):
+ elif not (
+ is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
+ ):
from pandas.core.series import Series
+
# E.g. if `arr` is an array with dtype='datetime64[ns]'
# and `value` is a pd.Timestamp, we may need to convert value
value_ser = Series(value)._values
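The `searchsorted` wrapper above first tries to give `value` the same dtype as `arr` so the underlying search stays on the fast native-dtype path; it is what `Series.searchsorted` and `Index.searchsorted` call. A small usage sketch:

    import pandas as pd

    ser = pd.Series([1, 2, 3])
    ser.searchsorted(2)                 # 1 (leftmost insertion point)
    ser.searchsorted(2, side="right")   # 2
    ser.searchsorted([0, 4])            # array([0, 3])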
@@ -1819,12 +1894,12 @@ def searchsorted(arr, value, side="left", sorter=None):
# ---- #
_diff_special = {
- 'float64': algos.diff_2d_float64,
- 'float32': algos.diff_2d_float32,
- 'int64': algos.diff_2d_int64,
- 'int32': algos.diff_2d_int32,
- 'int16': algos.diff_2d_int16,
- 'int8': algos.diff_2d_int8,
+ "float64": algos.diff_2d_float64,
+ "float32": algos.diff_2d_float32,
+ "int64": algos.diff_2d_int64,
+ "int32": algos.diff_2d_int32,
+ "int16": algos.diff_2d_int16,
+ "int8": algos.diff_2d_int8,
}
@@ -1854,7 +1929,7 @@ def diff(arr, n, axis=0):
is_timedelta = False
if needs_i8_conversion(arr):
dtype = np.float64
- arr = arr.view('i8')
+ arr = arr.view("i8")
na = iNaT
is_timedelta = True
@@ -1904,7 +1979,11 @@ def diff(arr, n, axis=0):
if is_timedelta:
from pandas import TimedeltaIndex
- out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(
- out_arr.shape).astype('timedelta64[ns]')
+
+ out_arr = (
+ TimedeltaIndex(out_arr.ravel().astype("int64"))
+ .asi8.reshape(out_arr.shape)
+ .astype("timedelta64[ns]")
+ )
return out_arr
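The datetime64 branch of `diff` reformatted above views the data as int64, takes the difference, and re-wraps the result as timedelta64[ns]; from the public API that looks like:

    import pandas as pd

    ser = pd.Series(pd.date_range("2019-01-01", periods=4, freq="D"))
    ser.diff()
    # 0      NaT
    # 1   1 days
    # 2   1 days
    # 3   1 days
    # dtype: timedelta64[ns]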
diff --git a/pandas/core/api.py b/pandas/core/api.py
index e8d21080775da..f3ea0976a2869 100644
--- a/pandas/core/api.py
+++ b/pandas/core/api.py
@@ -23,11 +23,20 @@
from pandas.core.arrays import Categorical, array
from pandas.core.groupby import Grouper, NamedAgg
from pandas.io.formats.format import set_eng_float_format
-from pandas.core.index import (Index, CategoricalIndex, Int64Index,
- UInt64Index, RangeIndex, Float64Index,
- MultiIndex, IntervalIndex,
- TimedeltaIndex, DatetimeIndex,
- PeriodIndex, NaT)
+from pandas.core.index import (
+ Index,
+ CategoricalIndex,
+ Int64Index,
+ UInt64Index,
+ RangeIndex,
+ Float64Index,
+ MultiIndex,
+ IntervalIndex,
+ TimedeltaIndex,
+ DatetimeIndex,
+ PeriodIndex,
+ NaT,
+)
from pandas.core.indexes.period import Period, period_range
from pandas.core.indexes.timedeltas import Timedelta, timedelta_range
from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range
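This hunk is typical of how black rewrites imports that no longer fit on one line: one name per line with a trailing comma, so later additions touch a single line of the diff. Schematically (the names below are placeholders, not from this patch):

    # before
    from somepackage import (first_name, second_name,
                             third_name)

    # after black
    from somepackage import (
        first_name,
        second_name,
        third_name,
    )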
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 7dc054c824fec..2246bbfde636d 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -7,16 +7,28 @@
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
- is_dict_like, is_extension_type, is_list_like, is_sequence)
+ is_dict_like,
+ is_extension_type,
+ is_list_like,
+ is_sequence,
+)
from pandas.core.dtypes.generic import ABCSeries
from pandas.io.formats.printing import pprint_thing
-def frame_apply(obj, func, axis=0, broadcast=None,
- raw=False, reduce=None, result_type=None,
- ignore_failures=False,
- args=None, kwds=None):
+def frame_apply(
+ obj,
+ func,
+ axis=0,
+ broadcast=None,
+ raw=False,
+ reduce=None,
+ result_type=None,
+ ignore_failures=False,
+ args=None,
+ kwds=None,
+):
""" construct and return a row or column based frame apply object """
axis = obj._get_axis_number(axis)
@@ -25,48 +37,71 @@ def frame_apply(obj, func, axis=0, broadcast=None,
elif axis == 1:
klass = FrameColumnApply
- return klass(obj, func, broadcast=broadcast,
- raw=raw, reduce=reduce, result_type=result_type,
- ignore_failures=ignore_failures,
- args=args, kwds=kwds)
+ return klass(
+ obj,
+ func,
+ broadcast=broadcast,
+ raw=raw,
+ reduce=reduce,
+ result_type=result_type,
+ ignore_failures=ignore_failures,
+ args=args,
+ kwds=kwds,
+ )
class FrameApply:
-
- def __init__(self, obj, func, broadcast, raw, reduce, result_type,
- ignore_failures, args, kwds):
+ def __init__(
+ self,
+ obj,
+ func,
+ broadcast,
+ raw,
+ reduce,
+ result_type,
+ ignore_failures,
+ args,
+ kwds,
+ ):
self.obj = obj
self.raw = raw
self.ignore_failures = ignore_failures
self.args = args or ()
self.kwds = kwds or {}
- if result_type not in [None, 'reduce', 'broadcast', 'expand']:
- raise ValueError("invalid value for result_type, must be one "
- "of {None, 'reduce', 'broadcast', 'expand'}")
+ if result_type not in [None, "reduce", "broadcast", "expand"]:
+ raise ValueError(
+ "invalid value for result_type, must be one "
+ "of {None, 'reduce', 'broadcast', 'expand'}"
+ )
if broadcast is not None:
- warnings.warn("The broadcast argument is deprecated and will "
- "be removed in a future version. You can specify "
- "result_type='broadcast' to broadcast the result "
- "to the original dimensions",
- FutureWarning, stacklevel=4)
+ warnings.warn(
+ "The broadcast argument is deprecated and will "
+ "be removed in a future version. You can specify "
+ "result_type='broadcast' to broadcast the result "
+ "to the original dimensions",
+ FutureWarning,
+ stacklevel=4,
+ )
if broadcast:
- result_type = 'broadcast'
+ result_type = "broadcast"
if reduce is not None:
- warnings.warn("The reduce argument is deprecated and will "
- "be removed in a future version. You can specify "
- "result_type='reduce' to try to reduce the result "
- "to the original dimensions",
- FutureWarning, stacklevel=4)
+ warnings.warn(
+ "The reduce argument is deprecated and will "
+ "be removed in a future version. You can specify "
+ "result_type='reduce' to try to reduce the result "
+ "to the original dimensions",
+ FutureWarning,
+ stacklevel=4,
+ )
if reduce:
if result_type is not None:
- raise ValueError(
- "cannot pass both reduce=True and result_type")
+ raise ValueError("cannot pass both reduce=True and result_type")
- result_type = 'reduce'
+ result_type = "reduce"
self.result_type = result_type
@@ -75,6 +110,7 @@ def __init__(self, obj, func, broadcast, raw, reduce, result_type,
def f(x):
return func(x, *args, **kwds)
+
else:
f = func
@@ -110,8 +146,7 @@ def get_result(self):
# dispatch to agg
if is_list_like(self.f) or is_dict_like(self.f):
- return self.obj.aggregate(self.f, axis=self.axis,
- *self.args, **self.kwds)
+ return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds)
# all empty
if len(self.columns) == 0 and len(self.index) == 0:
@@ -124,19 +159,20 @@ def get_result(self):
# don't, so inspect and insert if necessary.
func = getattr(self.obj, self.f)
sig = inspect.getfullargspec(func)
- if 'axis' in sig.args:
- self.kwds['axis'] = self.axis
+ if "axis" in sig.args:
+ self.kwds["axis"] = self.axis
return func(*self.args, **self.kwds)
# ufunc
elif isinstance(self.f, np.ufunc):
- with np.errstate(all='ignore'):
- results = self.obj._data.apply('apply', func=self.f)
- return self.obj._constructor(data=results, index=self.index,
- columns=self.columns, copy=False)
+ with np.errstate(all="ignore"):
+ results = self.obj._data.apply("apply", func=self.f)
+ return self.obj._constructor(
+ data=results, index=self.index, columns=self.columns, copy=False
+ )
# broadcasting
- if self.result_type == 'broadcast':
+ if self.result_type == "broadcast":
return self.apply_broadcast()
# one axis empty
@@ -159,13 +195,14 @@ def apply_empty_result(self):
# we are not asked to reduce or infer reduction
# so just return a copy of the existing object
- if self.result_type not in ['reduce', None]:
+ if self.result_type not in ["reduce", None]:
return self.obj.copy()
# we may need to infer
- reduce = self.result_type == 'reduce'
+ reduce = self.result_type == "reduce"
from pandas import Series
+
if not reduce:
EMPTY_SERIES = Series([])
@@ -190,12 +227,9 @@ def apply_raw(self):
# TODO: mixed type case
if result.ndim == 2:
- return self.obj._constructor(result,
- index=self.index,
- columns=self.columns)
+ return self.obj._constructor(result, index=self.index, columns=self.columns)
else:
- return self.obj._constructor_sliced(result,
- index=self.agg_axis)
+ return self.obj._constructor_sliced(result, index=self.agg_axis)
def apply_broadcast(self, target):
result_values = np.empty_like(target.values)
@@ -219,9 +253,9 @@ def apply_broadcast(self, target):
result_values[:, i] = res
# we *always* preserve the original index / columns
- result = self.obj._constructor(result_values,
- index=target.index,
- columns=target.columns)
+ result = self.obj._constructor(
+ result_values, index=target.index, columns=target.columns
+ )
return result
def apply_standard(self):
@@ -232,11 +266,14 @@ def apply_standard(self):
# we cannot reduce using non-numpy dtypes,
# as demonstrated in gh-12244
- if (self.result_type in ['reduce', None] and
- not self.dtypes.apply(is_extension_type).any()):
+ if (
+ self.result_type in ["reduce", None]
+ and not self.dtypes.apply(is_extension_type).any()
+ ):
# Create a dummy Series from an empty array
from pandas import Series
+
values = self.values
index = self.obj._get_axis(self.axis)
labels = self.agg_axis
@@ -244,10 +281,9 @@ def apply_standard(self):
dummy = Series(empty_arr, index=index, dtype=values.dtype)
try:
- result = reduction.reduce(values, self.f,
- axis=self.axis,
- dummy=dummy,
- labels=labels)
+ result = reduction.reduce(
+ values, self.f, axis=self.axis, dummy=dummy, labels=labels
+ )
return self.obj._constructor_sliced(result, index=labels)
except Exception:
pass
@@ -285,13 +321,12 @@ def apply_series_generator(self):
results[i] = self.f(v)
keys.append(v.name)
except Exception as e:
- if hasattr(e, 'args'):
+ if hasattr(e, "args"):
# make sure i is defined
if i is not None:
k = res_index[i]
- e.args = e.args + ('occurred at index %s' %
- pprint_thing(k), )
+ e.args = e.args + ("occurred at index %s" % pprint_thing(k),)
raise
self.results = results
@@ -321,8 +356,7 @@ def apply_broadcast(self):
@property
def series_generator(self):
- return (self.obj._ixs(i, axis=1)
- for i in range(len(self.columns)))
+ return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))
@property
def result_index(self):
@@ -362,9 +396,10 @@ def apply_broadcast(self):
@property
def series_generator(self):
constructor = self.obj._constructor_sliced
- return (constructor(arr, index=self.columns, name=name)
- for i, (arr, name) in enumerate(zip(self.values,
- self.index)))
+ return (
+ constructor(arr, index=self.columns, name=name)
+ for i, (arr, name) in enumerate(zip(self.values, self.index))
+ )
@property
def result_index(self):
@@ -379,12 +414,13 @@ def wrap_results_for_axis(self):
results = self.results
# we have requested to expand
- if self.result_type == 'expand':
+ if self.result_type == "expand":
result = self.infer_to_same_shape()
# we have a non-series and don't want inference
elif not isinstance(results[0], ABCSeries):
from pandas import Series
+
result = Series(results)
result.index = self.res_index
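`frame_apply` and the `FrameApply` hierarchy reformatted above implement `DataFrame.apply`, including the `result_type` values validated in `__init__`. An illustrative use of two of them:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    # list-like results are expanded into columns
    df.apply(lambda row: [row["a"], row["a"] + row["b"]], axis=1, result_type="expand")
    #    0  1
    # 0  1  4
    # 1  2  6

    # 'broadcast' keeps the original shape and column labels
    df.apply(lambda row: [0, 0], axis=1, result_type="broadcast")
    #    a  b
    # 0  0  0
    # 1  0  0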
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index 2d09a9eac6eab..dab29e9ce71d3 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -1,6 +1,9 @@
from .array_ import array # noqa: F401
from .base import ( # noqa: F401
- ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin)
+ ExtensionArray,
+ ExtensionOpsMixin,
+ ExtensionScalarOpsMixin,
+)
from .categorical import Categorical # noqa: F401
from .datetimes import DatetimeArray # noqa: F401
from .integer import IntegerArray, integer_array # noqa: F401
diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py
index 7a83b7960a6e7..15ff1432f16e2 100644
--- a/pandas/core/arrays/_ranges.py
+++ b/pandas/core/arrays/_ranges.py
@@ -12,10 +12,9 @@
from pandas.tseries.offsets import DateOffset, Tick, generate_range
-def generate_regular_range(start: Timestamp,
- end: Timestamp,
- periods: int,
- freq: DateOffset) -> Tuple[np.ndarray, str]:
+def generate_regular_range(
+ start: Timestamp, end: Timestamp, periods: int, freq: DateOffset
+) -> Tuple[np.ndarray, str]:
"""
Generate a range of dates with the spans between dates described by
the given `freq` DateOffset.
@@ -41,21 +40,22 @@ def generate_regular_range(start: Timestamp,
b = Timestamp(start).value
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
- e = (b + (Timestamp(end).value - b) // stride * stride +
- stride // 2 + 1)
+ e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + 1
# end.tz == start.tz by this point due to _generate implementation
tz = start.tz
elif start is not None:
b = Timestamp(start).value
- e = _generate_range_overflow_safe(b, periods, stride, side='start')
+ e = _generate_range_overflow_safe(b, periods, stride, side="start")
tz = start.tz
elif end is not None:
e = Timestamp(end).value + stride
- b = _generate_range_overflow_safe(e, periods, stride, side='end')
+ b = _generate_range_overflow_safe(e, periods, stride, side="end")
tz = end.tz
else:
- raise ValueError("at least 'start' or 'end' should be specified "
- "if a 'period' is given.")
+ raise ValueError(
+ "at least 'start' or 'end' should be specified "
+ "if a 'period' is given."
+ )
with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
@@ -76,18 +76,16 @@ def generate_regular_range(start: Timestamp,
elif end is not None:
tz = end.tz
- xdr = generate_range(start=start, end=end,
- periods=periods, offset=freq)
+ xdr = generate_range(start=start, end=end, periods=periods, offset=freq)
values = np.array([x.value for x in xdr], dtype=np.int64)
return values, tz
-def _generate_range_overflow_safe(endpoint: int,
- periods: int,
- stride: int,
- side: str = 'start') -> int:
+def _generate_range_overflow_safe(
+ endpoint: int, periods: int, stride: int, side: str = "start"
+) -> int:
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
@@ -113,12 +111,13 @@ def _generate_range_overflow_safe(endpoint: int,
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
- assert side in ['start', 'end']
+ assert side in ["start", "end"]
i64max = np.uint64(np.iinfo(np.int64).max)
- msg = ('Cannot generate range with {side}={endpoint} and '
- 'periods={periods}'
- .format(side=side, endpoint=endpoint, periods=periods))
+ msg = (
+ "Cannot generate range with {side}={endpoint} and "
+ "periods={periods}".format(side=side, endpoint=endpoint, periods=periods)
+ )
with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
@@ -130,40 +129,39 @@ def _generate_range_overflow_safe(endpoint: int,
if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
- return _generate_range_overflow_safe_signed(
- endpoint, periods, stride, side)
+ return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
- elif ((endpoint > 0 and side == 'start' and stride > 0) or
- (endpoint < 0 and side == 'end' and stride > 0)):
+ elif (endpoint > 0 and side == "start" and stride > 0) or (
+ endpoint < 0 and side == "end" and stride > 0
+ ):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)
- elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max):
+ elif side == "end" and endpoint > i64max and endpoint - stride <= i64max:
# in _generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
- return _generate_range_overflow_safe(endpoint - stride,
- periods - 1, stride, side)
+ return _generate_range_overflow_safe(
+ endpoint - stride, periods - 1, stride, side
+ )
# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
- midpoint = _generate_range_overflow_safe(endpoint, mid_periods,
- stride, side)
+ midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side)
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
-def _generate_range_overflow_safe_signed(endpoint: int,
- periods: int,
- stride: int,
- side: str) -> int:
+def _generate_range_overflow_safe_signed(
+ endpoint: int, periods: int, stride: int, side: str
+) -> int:
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
- assert side in ['start', 'end']
- if side == 'end':
+ assert side in ["start", "end"]
+ if side == "end":
stride *= -1
with np.errstate(over="raise"):
@@ -191,8 +189,8 @@ def _generate_range_overflow_safe_signed(endpoint: int,
if result <= i64max + np.uint64(stride):
return result
- raise OutOfBoundsDatetime('Cannot generate range with '
- '{side}={endpoint} and '
- 'periods={periods}'
- .format(side=side, endpoint=endpoint,
- periods=periods))
+ raise OutOfBoundsDatetime(
+ "Cannot generate range with "
+ "{side}={endpoint} and "
+ "periods={periods}".format(side=side, endpoint=endpoint, periods=periods)
+ )
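`_generate_range_overflow_safe` computes the second `np.arange` endpoint from nanosecond integers and splits the work when `periods * stride` could overflow; the user-facing entry point is `date_range`. A sketch (the exact error text may differ):

    import pandas as pd

    pd.date_range("2019-01-01", periods=3, freq="12H")
    # DatetimeIndex(['2019-01-01 00:00:00', '2019-01-01 12:00:00',
    #                '2019-01-02 00:00:00'], dtype='datetime64[ns]', freq='12H')

    # endpoints that would overflow int64 raise instead of wrapping around (GH 14187)
    pd.date_range(start=pd.Timestamp.max, periods=5, freq="D")
    # OutOfBoundsDatetime: Cannot generate range with start=... and periods=5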
diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py
index 1b002ad12d526..93ee570c1f971 100644
--- a/pandas/core/arrays/array_.py
+++ b/pandas/core/arrays/array_.py
@@ -5,15 +5,19 @@
from pandas._libs import lib, tslibs
from pandas.core.dtypes.common import (
- is_datetime64_ns_dtype, is_extension_array_dtype, is_timedelta64_ns_dtype)
+ is_datetime64_ns_dtype,
+ is_extension_array_dtype,
+ is_timedelta64_ns_dtype,
+)
from pandas.core.dtypes.dtypes import ExtensionDtype, registry
from pandas.core.dtypes.generic import ABCExtensionArray
-def array(data: Sequence[object],
- dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
- copy: bool = True,
- ) -> ABCExtensionArray:
+def array(
+ data: Sequence[object],
+ dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,
+ copy: bool = True,
+) -> ABCExtensionArray:
"""
Create an array.
@@ -207,16 +211,17 @@ def array(data: Sequence[object],
ValueError: Cannot pass scalar '1' to 'pandas.array'.
"""
from pandas.core.arrays import (
- period_array, ExtensionArray, IntervalArray, PandasArray,
+ period_array,
+ ExtensionArray,
+ IntervalArray,
+ PandasArray,
DatetimeArray,
TimedeltaArray,
)
from pandas.core.internals.arrays import extract_array
if lib.is_scalar(data):
- msg = (
- "Cannot pass scalar '{}' to 'pandas.array'."
- )
+ msg = "Cannot pass scalar '{}' to 'pandas.array'."
raise ValueError(msg.format(data))
data = extract_array(data, extract_numpy=True)
@@ -234,14 +239,14 @@ def array(data: Sequence[object],
if dtype is None:
inferred_dtype = lib.infer_dtype(data, skipna=False)
- if inferred_dtype == 'period':
+ if inferred_dtype == "period":
try:
return period_array(data, copy=copy)
except tslibs.IncompatibleFrequency:
# We may have a mixture of frequencies.
# We choose to return an ndarray, rather than raising.
pass
- elif inferred_dtype == 'interval':
+ elif inferred_dtype == "interval":
try:
return IntervalArray(data, copy=copy)
except ValueError:
@@ -249,7 +254,7 @@ def array(data: Sequence[object],
# We choose to return an ndarray, rather than raising.
pass
- elif inferred_dtype.startswith('datetime'):
+ elif inferred_dtype.startswith("datetime"):
# datetime, datetime64
try:
return DatetimeArray._from_sequence(data, copy=copy)
@@ -257,7 +262,7 @@ def array(data: Sequence[object],
# Mixture of timezones, fall back to PandasArray
pass
- elif inferred_dtype.startswith('timedelta'):
+ elif inferred_dtype.startswith("timedelta"):
# timedelta, timedelta64
return TimedeltaArray._from_sequence(data, copy=copy)
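`pandas.array` above dispatches on the inferred dtype of the data ('period', 'interval', 'datetime', 'timedelta', ...) and otherwise falls back to `PandasArray`. A short sketch (reprs abbreviated):

    import pandas as pd

    pd.array([pd.Period("2019-01", freq="M"), pd.Period("2019-02", freq="M")])
    # <PeriodArray> ['2019-01', '2019-02'], dtype: period[M]

    pd.array([pd.Timestamp("2019-01-01"), pd.Timestamp("2019-01-02")])
    # <DatetimeArray> ..., dtype: datetime64[ns]

    pd.array(1)
    # ValueError: Cannot pass scalar '1' to 'pandas.array'.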
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 21f0f3c08e93b..2a5556ff6d357 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -17,8 +17,7 @@
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import (
- ABCExtensionArray, ABCIndexClass, ABCSeries)
+from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
from pandas._typing import ArrayLike
@@ -120,9 +119,10 @@ class ExtensionArray:
See :ref:`extending.extension.ufunc` for more.
"""
+
# '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray.
# Don't override this.
- _typ = 'extension'
+ _typ = "extension"
# ------------------------------------------------------------------------
# Constructors
@@ -272,8 +272,8 @@ def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None:
# __init__ method coerces that value, then so should __setitem__
# Note, also, that Series/DataFrame.where internally use __setitem__
# on a copy of the data.
- raise NotImplementedError(_not_implemented_message.format(
- type(self), '__setitem__')
+ raise NotImplementedError(
+ _not_implemented_message.format(type(self), "__setitem__")
)
def __len__(self) -> int:
@@ -393,7 +393,7 @@ def _values_for_argsort(self) -> np.ndarray:
# Note: this is used in `ExtensionArray.argsort`.
return np.array(self)
- def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
+ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
"""
Return the indices that would sort this array.
@@ -423,8 +423,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
# 2. argsort : total control over sorting.
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
- result = nargsort(self, kind=kind, ascending=ascending,
- na_position='last')
+ result = nargsort(self, kind=kind, ascending=ascending, na_position="last")
return result
def fillna(self, value=None, method=None, limit=None):
@@ -463,15 +462,16 @@ def fillna(self, value=None, method=None, limit=None):
if is_array_like(value):
if len(value) != len(self):
- raise ValueError("Length of 'value' does not match. Got ({}) "
- " expected {}".format(len(value), len(self)))
+ raise ValueError(
+ "Length of 'value' does not match. Got ({}) "
+ " expected {}".format(len(value), len(self))
+ )
value = value[mask]
if mask.any():
if method is not None:
- func = pad_1d if method == 'pad' else backfill_1d
- new_values = func(self.astype(object), limit=limit,
- mask=mask)
+ func = pad_1d if method == "pad" else backfill_1d
+ new_values = func(self.astype(object), limit=limit, mask=mask)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
@@ -491,10 +491,7 @@ def dropna(self):
"""
return self[~self.isna()]
- def shift(
- self,
- periods: int = 1,
- fill_value: object = None) -> ABCExtensionArray:
+ def shift(self, periods: int = 1, fill_value: object = None) -> ABCExtensionArray:
"""
Shift values by desired number.
@@ -537,14 +534,13 @@ def shift(
fill_value = self.dtype.na_value
empty = self._from_sequence(
- [fill_value] * min(abs(periods), len(self)),
- dtype=self.dtype
+ [fill_value] * min(abs(periods), len(self)), dtype=self.dtype
)
if periods > 0:
a = empty
b = self[:-periods]
else:
- a = self[abs(periods):]
+ a = self[abs(periods) :]
b = empty
return self._concat_same_type([a, b])
@@ -633,10 +629,7 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
"""
return self.astype(object), np.nan
- def factorize(
- self,
- na_sentinel: int = -1,
- ) -> Tuple[np.ndarray, ABCExtensionArray]:
+ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArray]:
"""
Encode the extension array as an enumerated type.
@@ -679,13 +672,16 @@ def factorize(
arr, na_value = self._values_for_factorize()
- labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel,
- na_value=na_value)
+ labels, uniques = _factorize_array(
+ arr, na_sentinel=na_sentinel, na_value=na_value
+ )
uniques = self._from_factorized(uniques, self)
return labels, uniques
- _extension_array_shared_docs['repeat'] = """
+ _extension_array_shared_docs[
+ "repeat"
+ ] = """
Repeat elements of a %(klass)s.
Returns a new %(klass)s where each element of the current %(klass)s
@@ -727,8 +723,8 @@ def factorize(
Categories (3, object): [a, b, c]
"""
- @Substitution(klass='ExtensionArray')
- @Appender(_extension_array_shared_docs['repeat'])
+ @Substitution(klass="ExtensionArray")
+ @Appender(_extension_array_shared_docs["repeat"])
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
ind = np.arange(len(self)).repeat(repeats)
@@ -739,10 +735,7 @@ def repeat(self, repeats, axis=None):
# ------------------------------------------------------------------------
def take(
- self,
- indices: Sequence[int],
- allow_fill: bool = False,
- fill_value: Any = None
+ self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None
) -> ABCExtensionArray:
"""
Take elements from an array.
@@ -849,25 +842,19 @@ def copy(self) -> ABCExtensionArray:
def __repr__(self):
from pandas.io.formats.printing import format_object_summary
- template = (
- '{class_name}'
- '{data}\n'
- 'Length: {length}, dtype: {dtype}'
- )
+ template = "{class_name}" "{data}\n" "Length: {length}, dtype: {dtype}"
# the short repr has no trailing newline, while the truncated
# repr does. So we include a newline in our template, and strip
# any trailing newlines from format_object_summary
- data = format_object_summary(self, self._formatter(),
- indent_for_name=False).rstrip(', \n')
- class_name = '<{}>\n'.format(self.__class__.__name__)
- return template.format(class_name=class_name, data=data,
- length=len(self),
- dtype=self.dtype)
-
- def _formatter(
- self,
- boxed: bool = False,
- ) -> Callable[[Any], Optional[str]]:
+ data = format_object_summary(
+ self, self._formatter(), indent_for_name=False
+ ).rstrip(", \n")
+ class_name = "<{}>\n".format(self.__class__.__name__)
+ return template.format(
+ class_name=class_name, data=data, length=len(self), dtype=self.dtype
+ )
+
+ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]:
"""Formatting function for scalar values.
This is used in the default '__repr__'. The returned formatting
@@ -926,8 +913,7 @@ def ravel(self, order="C") -> ABCExtensionArray:
@classmethod
def _concat_same_type(
- cls,
- to_concat: Sequence[ABCExtensionArray]
+ cls, to_concat: Sequence[ABCExtensionArray]
) -> ABCExtensionArray:
"""
Concatenate multiple array
@@ -985,8 +971,11 @@ def _reduce(self, name, skipna=True, **kwargs):
------
TypeError : subclass does not define reductions
"""
- raise TypeError("cannot perform {name} with type {dtype}".format(
- name=name, dtype=self.dtype))
+ raise TypeError(
+ "cannot perform {name} with type {dtype}".format(
+ name=name, dtype=self.dtype
+ )
+ )
class ExtensionOpsMixin:
@@ -1127,7 +1116,7 @@ def _maybe_convert(arr):
res = np.asarray(arr)
return res
- if op.__name__ in {'divmod', 'rdivmod'}:
+ if op.__name__ in {"divmod", "rdivmod"}:
a, b = zip(*res)
res = _maybe_convert(a), _maybe_convert(b)
else:
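The `ExtensionArray` methods touched above (`take`, `shift`, `fillna`, `factorize`, `_concat_same_type`, ...) make up the interface every extension type shares. Exercised through the built-in nullable integer array (illustrative values, missing entries shown as NaN in this pandas version):

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")   # IntegerArray, an ExtensionArray

    arr.take([0, -1], allow_fill=True)   # [1, NaN]; -1 means "missing" when allow_fill=True
    arr.shift(1)                         # [NaN, 1, 2]
    arr.fillna(0)                        # [1, 2, 0]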
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 5ae71ffb165e9..c4f7d6dbe32fa 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -9,20 +9,41 @@
from pandas._libs import algos as libalgos, lib
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
- Appender, Substitution, cache_readonly, deprecate_kwarg)
+ Appender,
+ Substitution,
+ cache_readonly,
+ deprecate_kwarg,
+)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
-from pandas.core.dtypes.cast import (
- coerce_indexer_dtype, maybe_infer_to_datetimelike)
+from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike
from pandas.core.dtypes.common import (
- ensure_int64, ensure_object, ensure_platform_int, is_categorical,
- is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
- is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype,
- is_iterator, is_list_like, is_object_dtype, is_scalar, is_sequence,
- is_timedelta64_dtype)
+ ensure_int64,
+ ensure_object,
+ ensure_platform_int,
+ is_categorical,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetimelike,
+ is_dict_like,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ is_iterator,
+ is_list_like,
+ is_object_dtype,
+ is_scalar,
+ is_sequence,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import (
- ABCCategoricalIndex, ABCDataFrame, ABCIndexClass, ABCSeries)
+ ABCCategoricalIndex,
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCSeries,
+)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna
@@ -39,7 +60,8 @@
from .base import ExtensionArray, _extension_array_shared_docs
-_take_msg = textwrap.dedent("""\
+_take_msg = textwrap.dedent(
+ """\
Interpreting negative values in 'indexer' as missing values.
In the future, this will change to meaning positional indices
from the right.
@@ -47,7 +69,8 @@
Use 'allow_fill=True' to retain the previous behavior and silence this
warning.
- Use 'allow_fill=False' to accept the new behavior.""")
+ Use 'allow_fill=False' to accept the new behavior."""
+)
def _cat_compare_op(op):
@@ -63,28 +86,27 @@ def f(self, other):
other = lib.item_from_zerodim(other)
if not self.ordered:
- if op in ['__lt__', '__gt__', '__le__', '__ge__']:
- raise TypeError("Unordered Categoricals can only compare "
- "equality or not")
+ if op in ["__lt__", "__gt__", "__le__", "__ge__"]:
+ raise TypeError(
+ "Unordered Categoricals can only compare " "equality or not"
+ )
if isinstance(other, Categorical):
# Two Categoricals can only be be compared if the categories are
# the same (maybe up to ordering, depending on ordered)
- msg = ("Categoricals can only be compared if "
- "'categories' are the same.")
+ msg = "Categoricals can only be compared if " "'categories' are the same."
if len(self.categories) != len(other.categories):
raise TypeError(msg + " Categories are different lengths")
- elif (self.ordered and not (self.categories ==
- other.categories).all()):
+ elif self.ordered and not (self.categories == other.categories).all():
raise TypeError(msg)
elif not set(self.categories) == set(other.categories):
raise TypeError(msg)
if not (self.ordered == other.ordered):
- raise TypeError("Categoricals can only be compared if "
- "'ordered' is the same")
- if not self.ordered and not self.categories.equals(
- other.categories):
+ raise TypeError(
+ "Categoricals can only be compared if " "'ordered' is the same"
+ )
+ if not self.ordered and not self.categories.equals(other.categories):
# both unordered and different order
other_codes = _get_codes_for_values(other, self.categories)
else:
@@ -104,28 +126,32 @@ def f(self, other):
ret = getattr(self._codes, op)(i)
# check for NaN in self
- mask = (self._codes == -1)
+ mask = self._codes == -1
ret[mask] = False
return ret
else:
- if op == '__eq__':
+ if op == "__eq__":
return np.repeat(False, len(self))
- elif op == '__ne__':
+ elif op == "__ne__":
return np.repeat(True, len(self))
else:
- msg = ("Cannot compare a Categorical for op {op} with a "
- "scalar, which is not a category.")
+ msg = (
+ "Cannot compare a Categorical for op {op} with a "
+ "scalar, which is not a category."
+ )
raise TypeError(msg.format(op=op))
else:
# allow categorical vs object dtype array comparisons for equality
# these are only positional comparisons
- if op in ['__eq__', '__ne__']:
+ if op in ["__eq__", "__ne__"]:
return getattr(np.array(self), op)(np.array(other))
- msg = ("Cannot compare a Categorical for op {op} with type {typ}."
- "\nIf you want to compare values, use 'np.asarray(cat) "
- " other'.")
+ msg = (
+ "Cannot compare a Categorical for op {op} with type {typ}."
+ "\nIf you want to compare values, use 'np.asarray(cat) "
+ " other'."
+ )
raise TypeError(msg.format(op=op, typ=type(other)))
f.__name__ = op
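`_cat_compare_op` builds the comparison dunders installed further down (`__eq__`, `__lt__`, ...). The rules it enforces, seen from the public API (sketch):

    import pandas as pd

    a = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True)
    b = pd.Categorical(["b", "a"], categories=["a", "b"], ordered=True)

    a < b       # array([ True, False])
    a == "a"    # array([ True, False])

    u = pd.Categorical(["a", "b"])   # unordered
    u < u
    # TypeError: Unordered Categoricals can only compare equality or not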
@@ -308,14 +334,16 @@ class Categorical(ExtensionArray, PandasObject):
__array_priority__ = 1000
_dtype = CategoricalDtype(ordered=False)
# tolist is not actually deprecated, just suppressed in the __dir__
- _deprecations = frozenset(['labels', 'tolist'])
- _typ = 'categorical'
+ _deprecations = frozenset(["labels", "tolist"])
+ _typ = "categorical"
- def __init__(self, values, categories=None, ordered=None, dtype=None,
- fastpath=False):
+ def __init__(
+ self, values, categories=None, ordered=None, dtype=None, fastpath=False
+ ):
- dtype = CategoricalDtype._from_values_or_dtype(values, categories,
- ordered, dtype)
+ dtype = CategoricalDtype._from_values_or_dtype(
+ values, categories, ordered, dtype
+ )
# At this point, dtype is always a CategoricalDtype, but
# we may have dtype.categories be None, and we need to
# infer categories in a factorization step futher below
@@ -340,9 +368,10 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
if not isinstance(values, np.ndarray):
values = _convert_to_list_like(values)
from pandas.core.internals.construction import sanitize_array
+
# By convention, empty lists result in object dtype:
if len(values) == 0:
- sanitize_dtype = 'object'
+ sanitize_dtype = "object"
else:
sanitize_dtype = None
null_mask = isna(values)
@@ -358,30 +387,35 @@ def __init__(self, values, categories=None, ordered=None, dtype=None,
if dtype._ordered:
# raise, as we don't have a sortable data structure and so
# the user should give us one by specifying categories
- raise TypeError("'values' is not ordered, please "
- "explicitly specify the categories order "
- "by passing in a categories argument.")
+ raise TypeError(
+ "'values' is not ordered, please "
+ "explicitly specify the categories order "
+ "by passing in a categories argument."
+ )
except ValueError:
# FIXME
- raise NotImplementedError("> 1 ndim Categorical are not "
- "supported at this time")
+ raise NotImplementedError(
+ "> 1 ndim Categorical are not " "supported at this time"
+ )
# we're inferring from values
dtype = CategoricalDtype(categories, dtype._ordered)
elif is_categorical_dtype(values):
- old_codes = (values._values.codes if isinstance(values, ABCSeries)
- else values.codes)
- codes = _recode_for_categories(old_codes, values.dtype.categories,
- dtype.categories)
+ old_codes = (
+ values._values.codes if isinstance(values, ABCSeries) else values.codes
+ )
+ codes = _recode_for_categories(
+ old_codes, values.dtype.categories, dtype.categories
+ )
else:
codes = _get_codes_for_values(values, dtype.categories)
if null_mask.any():
# Reinsert -1 placeholders for previously removed missing values
- full_codes = - np.ones(null_mask.shape, dtype=codes.dtype)
+ full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
full_codes[~null_mask] = codes
codes = full_codes
@@ -422,10 +456,13 @@ def categories(self):
@categories.setter
def categories(self, categories):
new_dtype = CategoricalDtype(categories, ordered=self.ordered)
- if (self.dtype.categories is not None and
- len(self.dtype.categories) != len(new_dtype.categories)):
- raise ValueError("new categories need to have the same number of "
- "items as the old categories!")
+ if self.dtype.categories is not None and len(self.dtype.categories) != len(
+ new_dtype.categories
+ ):
+ raise ValueError(
+ "new categories need to have the same number of "
+ "items as the old categories!"
+ )
self._dtype = new_dtype
@property
@@ -462,9 +499,9 @@ def copy(self):
"""
Copy constructor.
"""
- return self._constructor(values=self._codes.copy(),
- dtype=self.dtype,
- fastpath=True)
+ return self._constructor(
+ values=self._codes.copy(), dtype=self.dtype, fastpath=True
+ )
def astype(self, dtype, copy=True):
"""
@@ -531,8 +568,9 @@ def base(self):
return None
@classmethod
- def _from_inferred_categories(cls, inferred_categories, inferred_codes,
- dtype, true_values=None):
+ def _from_inferred_categories(
+ cls, inferred_categories, inferred_codes, dtype, true_values=None
+ ):
"""
Construct a Categorical from inferred values.
@@ -556,8 +594,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
from pandas import Index, to_numeric, to_datetime, to_timedelta
cats = Index(inferred_categories)
- known_categories = (isinstance(dtype, CategoricalDtype) and
- dtype.categories is not None)
+ known_categories = (
+ isinstance(dtype, CategoricalDtype) and dtype.categories is not None
+ )
if known_categories:
# Convert to a specialized type with `dtype` if specified.
@@ -582,8 +621,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
unsorted = cats.copy()
categories = cats.sort_values()
- codes = _recode_for_categories(inferred_codes, unsorted,
- categories)
+ codes = _recode_for_categories(inferred_codes, unsorted, categories)
dtype = CategoricalDtype(categories, ordered=False)
else:
dtype = CategoricalDtype(cats, ordered=False)
@@ -636,31 +674,37 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
[a, b, a, b]
Categories (2, object): [a < b]
"""
- dtype = CategoricalDtype._from_values_or_dtype(categories=categories,
- ordered=ordered,
- dtype=dtype)
+ dtype = CategoricalDtype._from_values_or_dtype(
+ categories=categories, ordered=ordered, dtype=dtype
+ )
if dtype.categories is None:
- msg = ("The categories must be provided in 'categories' or "
- "'dtype'. Both were None.")
+ msg = (
+ "The categories must be provided in 'categories' or "
+ "'dtype'. Both were None."
+ )
raise ValueError(msg)
codes = np.asarray(codes) # #21767
if not is_integer_dtype(codes):
msg = "codes need to be array-like integers"
if is_float_dtype(codes):
- icodes = codes.astype('i8')
+ icodes = codes.astype("i8")
if (icodes == codes).all():
msg = None
codes = icodes
- warn(("float codes will be disallowed in the future and "
- "raise a ValueError"), FutureWarning, stacklevel=2)
+ warn(
+ (
+ "float codes will be disallowed in the future and "
+ "raise a ValueError"
+ ),
+ FutureWarning,
+ stacklevel=2,
+ )
if msg:
raise ValueError(msg)
- if len(codes) and (
- codes.max() >= len(dtype.categories) or codes.min() < -1):
- raise ValueError("codes need to be between -1 and "
- "len(categories)-1")
+ if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
+ raise ValueError("codes need to be between -1 and " "len(categories)-1")
return cls(codes, dtype=dtype, fastpath=True)
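`from_codes` validates that the codes are integers in the range [-1, len(categories) - 1], with -1 encoding a missing value:

    import pandas as pd

    pd.Categorical.from_codes([0, 1, 0, -1], categories=["a", "b"])
    # [a, b, a, NaN]
    # Categories (2, object): [a, b]

    pd.Categorical.from_codes([0, 2], categories=["a", "b"])
    # ValueError: codes need to be between -1 and len(categories)-1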
@@ -710,14 +754,18 @@ def _set_categories(self, categories, fastpath=False):
"""
if fastpath:
- new_dtype = CategoricalDtype._from_fastpath(categories,
- self.ordered)
+ new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
else:
new_dtype = CategoricalDtype(categories, ordered=self.ordered)
- if (not fastpath and self.dtype.categories is not None and
- len(new_dtype.categories) != len(self.dtype.categories)):
- raise ValueError("new categories need to have the same number of "
- "items than the old categories!")
+ if (
+ not fastpath
+ and self.dtype.categories is not None
+ and len(new_dtype.categories) != len(self.dtype.categories)
+ ):
+ raise ValueError(
+ "new categories need to have the same number of "
+ "items than the old categories!"
+ )
self._dtype = new_dtype
@@ -734,8 +782,7 @@ def _set_dtype(self, dtype):
We don't do any validation here. It's assumed that the dtype is
a (valid) instance of `CategoricalDtype`.
"""
- codes = _recode_for_categories(self.codes, self.categories,
- dtype.categories)
+ codes = _recode_for_categories(self.codes, self.categories, dtype.categories)
return type(self)(codes, dtype=dtype, fastpath=True)
def set_ordered(self, value, inplace=False):
@@ -750,7 +797,7 @@ def set_ordered(self, value, inplace=False):
Whether or not to set the ordered attribute in-place or return
a copy of this categorical with ordered set to the value.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
new_dtype = CategoricalDtype(self.categories, ordered=value)
cat = self if inplace else self.copy()
cat._dtype = new_dtype
@@ -772,7 +819,7 @@ def as_ordered(self, inplace=False):
Categorical
Ordered Categorical.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
return self.set_ordered(True, inplace=inplace)
def as_unordered(self, inplace=False):
@@ -790,11 +837,10 @@ def as_unordered(self, inplace=False):
Categorical
Unordered Categorical.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
return self.set_ordered(False, inplace=inplace)
- def set_categories(self, new_categories, ordered=None, rename=False,
- inplace=False):
+ def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
"""
Set the categories to the specified new_categories.
@@ -845,20 +891,22 @@ def set_categories(self, new_categories, ordered=None, rename=False,
remove_categories
remove_unused_categories
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if ordered is None:
ordered = self.dtype._ordered
new_dtype = CategoricalDtype(new_categories, ordered=ordered)
cat = self if inplace else self.copy()
if rename:
- if (cat.dtype.categories is not None and
- len(new_dtype.categories) < len(cat.dtype.categories)):
+ if cat.dtype.categories is not None and len(new_dtype.categories) < len(
+ cat.dtype.categories
+ ):
# remove all _codes which are larger and set to -1/NaN
cat._codes[cat._codes >= len(new_dtype.categories)] = -1
else:
- codes = _recode_for_categories(cat.codes, cat.categories,
- new_dtype.categories)
+ codes = _recode_for_categories(
+ cat.codes, cat.categories, new_dtype.categories
+ )
cat._codes = codes
cat._dtype = new_dtype
@@ -932,12 +980,11 @@ def rename_categories(self, new_categories, inplace=False):
[A, A, B]
Categories (2, object): [A, B]
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
cat = self if inplace else self.copy()
if is_dict_like(new_categories):
- cat.categories = [new_categories.get(item, item)
- for item in cat.categories]
+ cat.categories = [new_categories.get(item, item) for item in cat.categories]
elif callable(new_categories):
cat.categories = [new_categories(item) for item in cat.categories]
else:
@@ -981,12 +1028,12 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False):
remove_unused_categories
set_categories
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if set(self.dtype.categories) != set(new_categories):
- raise ValueError("items in new_categories are not the same as in "
- "old categories")
- return self.set_categories(new_categories, ordered=ordered,
- inplace=inplace)
+ raise ValueError(
+ "items in new_categories are not the same as in " "old categories"
+ )
+ return self.set_categories(new_categories, ordered=ordered, inplace=inplace)
def add_categories(self, new_categories, inplace=False):
"""
@@ -1021,13 +1068,15 @@ def add_categories(self, new_categories, inplace=False):
remove_unused_categories
set_categories
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if not is_list_like(new_categories):
new_categories = [new_categories]
already_included = set(new_categories) & set(self.dtype.categories)
if len(already_included) != 0:
- msg = ("new categories must not include old categories: "
- "{already_included!s}")
+ msg = (
+ "new categories must not include old categories: "
+ "{already_included!s}"
+ )
raise ValueError(msg.format(already_included=already_included))
new_categories = list(self.dtype.categories) + list(new_categories)
new_dtype = CategoricalDtype(new_categories, self.ordered)
@@ -1070,14 +1119,13 @@ def remove_categories(self, removals, inplace=False):
remove_unused_categories
set_categories
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if not is_list_like(removals):
removals = [removals]
removal_set = set(list(removals))
not_included = removal_set - set(self.dtype.categories)
- new_categories = [c for c in self.dtype.categories
- if c not in removal_set]
+ new_categories = [c for c in self.dtype.categories if c not in removal_set]
# GH 10156
if any(isna(removals)):
@@ -1088,8 +1136,9 @@ def remove_categories(self, removals, inplace=False):
msg = "removals must all be in old categories: {not_included!s}"
raise ValueError(msg.format(not_included=not_included))
- return self.set_categories(new_categories, ordered=self.ordered,
- rename=False, inplace=inplace)
+ return self.set_categories(
+ new_categories, ordered=self.ordered, rename=False, inplace=inplace
+ )
def remove_unused_categories(self, inplace=False):
"""
@@ -1113,7 +1162,7 @@ def remove_unused_categories(self, inplace=False):
remove_categories
set_categories
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
cat = self if inplace else self.copy()
idx, inv = np.unique(cat._codes, return_inverse=True)
@@ -1121,8 +1170,9 @@ def remove_unused_categories(self, inplace=False):
idx, inv = idx[1:], inv - 1
new_categories = cat.dtype.categories.take(idx)
- new_dtype = CategoricalDtype._from_fastpath(new_categories,
- ordered=self.ordered)
+ new_dtype = CategoricalDtype._from_fastpath(
+ new_categories, ordered=self.ordered
+ )
cat._dtype = new_dtype
cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
@@ -1200,23 +1250,22 @@ def map(self, mapper):
"""
new_categories = self.categories.map(mapper)
try:
- return self.from_codes(self._codes.copy(),
- categories=new_categories,
- ordered=self.ordered)
+ return self.from_codes(
+ self._codes.copy(), categories=new_categories, ordered=self.ordered
+ )
except ValueError:
# NA values are represented in self._codes with -1
# np.take causes NA values to take final element in new_categories
if np.any(self._codes == -1):
- new_categories = new_categories.insert(len(new_categories),
- np.nan)
+ new_categories = new_categories.insert(len(new_categories), np.nan)
return np.take(new_categories, self._codes)
- __eq__ = _cat_compare_op('__eq__')
- __ne__ = _cat_compare_op('__ne__')
- __lt__ = _cat_compare_op('__lt__')
- __gt__ = _cat_compare_op('__gt__')
- __le__ = _cat_compare_op('__le__')
- __ge__ = _cat_compare_op('__ge__')
+ __eq__ = _cat_compare_op("__eq__")
+ __ne__ = _cat_compare_op("__ne__")
+ __lt__ = _cat_compare_op("__lt__")
+ __gt__ = _cat_compare_op("__gt__")
+ __le__ = _cat_compare_op("__le__")
+ __ge__ = _cat_compare_op("__ge__")
# for Series/ndarray like compat
@property
@@ -1262,9 +1311,11 @@ def shift(self, periods, fill_value=None):
elif fill_value in self.categories:
fill_value = self.categories.get_loc(fill_value)
else:
- raise ValueError("'fill_value={}' is not present "
- "in this Categorical's "
- "categories".format(fill_value))
+ raise ValueError(
+ "'fill_value={}' is not present "
+ "in this Categorical's "
+ "categories".format(fill_value)
+ )
if periods > 0:
codes[:periods] = fill_value
else:
@@ -1296,43 +1347,43 @@ def __array__(self, dtype=None):
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs)
+ self, ufunc, method, *inputs, **kwargs
+ )
if result is not NotImplemented:
return result
# for all other cases, raise for now (similar to what happens in
# Series.__array_prepare__)
- raise TypeError("Object with dtype {dtype} cannot perform "
- "the numpy op {op}".format(
- dtype=self.dtype,
- op=ufunc.__name__))
+ raise TypeError(
+ "Object with dtype {dtype} cannot perform "
+ "the numpy op {op}".format(dtype=self.dtype, op=ufunc.__name__)
+ )
def __setstate__(self, state):
"""Necessary for making this object picklable"""
if not isinstance(state, dict):
- raise Exception('invalid pickle state')
+ raise Exception("invalid pickle state")
# Provide compatibility with pre-0.15.0 Categoricals.
- if '_categories' not in state and '_levels' in state:
- state['_categories'] = self.dtype.validate_categories(state.pop(
- '_levels'))
- if '_codes' not in state and 'labels' in state:
- state['_codes'] = coerce_indexer_dtype(
- state.pop('labels'), state['_categories'])
+ if "_categories" not in state and "_levels" in state:
+ state["_categories"] = self.dtype.validate_categories(state.pop("_levels"))
+ if "_codes" not in state and "labels" in state:
+ state["_codes"] = coerce_indexer_dtype(
+ state.pop("labels"), state["_categories"]
+ )
# 0.16.0 ordered change
- if '_ordered' not in state:
+ if "_ordered" not in state:
# >= 0.15.0, < 0.16.0
- if 'ordered' in state:
- state['_ordered'] = state.pop('ordered')
+ if "ordered" in state:
+ state["_ordered"] = state.pop("ordered")
else:
- state['_ordered'] = False
+ state["_ordered"] = False
# 0.21.0 CategoricalDtype change
- if '_dtype' not in state:
- state['_dtype'] = CategoricalDtype(state['_categories'],
- state['_ordered'])
+ if "_dtype" not in state:
+ state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])
for k, v in state.items():
setattr(self, k, v)
@@ -1371,18 +1422,20 @@ def memory_usage(self, deep=False):
--------
numpy.ndarray.nbytes
"""
- return self._codes.nbytes + self.dtype.categories.memory_usage(
- deep=deep)
+ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep)
- @Substitution(klass='Categorical')
- @Appender(_shared_docs['searchsorted'])
- def searchsorted(self, value, side='left', sorter=None):
+ @Substitution(klass="Categorical")
+ @Appender(_shared_docs["searchsorted"])
+ def searchsorted(self, value, side="left", sorter=None):
if not self.ordered:
- raise ValueError("Categorical not ordered\nyou can use "
- ".as_ordered() to change the Categorical to an "
- "ordered one")
+ raise ValueError(
+ "Categorical not ordered\nyou can use "
+ ".as_ordered() to change the Categorical to an "
+ "ordered one"
+ )
from pandas.core.series import Series
+
codes = _get_codes_for_values(Series(value).values, self.categories)
if -1 in codes:
raise KeyError("Value(s) to be inserted must be in categories.")
@@ -1411,6 +1464,7 @@ def isna(self):
ret = self._codes == -1
return ret
+
isnull = isna
def notna(self):
@@ -1432,14 +1486,14 @@ def notna(self):
"""
return ~self.isna()
+
notnull = notna
def put(self, *args, **kwargs):
"""
Replace specific elements in the Categorical with given values.
"""
- raise NotImplementedError(("'put' is not yet implemented "
- "for Categorical"))
+ raise NotImplementedError("'put' is not yet implemented for Categorical")
def dropna(self):
"""
@@ -1489,10 +1543,9 @@ def value_counts(self, dropna=True):
count = bincount(np.where(mask, code, ncat))
ix = np.append(ix, -1)
- ix = self._constructor(ix, dtype=self.dtype,
- fastpath=True)
+ ix = self._constructor(ix, dtype=self.dtype, fastpath=True)
- return Series(count, index=CategoricalIndex(ix), dtype='int64')
+ return Series(count, index=CategoricalIndex(ix), dtype="int64")
def get_values(self):
"""
@@ -1508,8 +1561,12 @@ def get_values(self):
A numpy array of the same dtype as categorical.categories.dtype or
Index if datetime / periods.
"""
- warn("The 'get_values' method is deprecated and will be removed in a "
- "future version", FutureWarning, stacklevel=2)
+ warn(
+ "The 'get_values' method is deprecated and will be removed in a "
+ "future version",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._internal_get_values()
def _internal_get_values(self):
@@ -1517,21 +1574,22 @@ def _internal_get_values(self):
if is_datetimelike(self.categories):
return self.categories.take(self._codes, fill_value=np.nan)
elif is_integer_dtype(self.categories) and -1 in self._codes:
- return self.categories.astype("object").take(self._codes,
- fill_value=np.nan)
+ return self.categories.astype("object").take(self._codes, fill_value=np.nan)
return np.array(self)
def check_for_ordered(self, op):
""" assert that we are ordered """
if not self.ordered:
- raise TypeError("Categorical is not ordered for operation {op}\n"
- "you can use .as_ordered() to change the "
- "Categorical to an ordered one\n".format(op=op))
+ raise TypeError(
+ "Categorical is not ordered for operation {op}\n"
+ "you can use .as_ordered() to change the "
+ "Categorical to an ordered one\n".format(op=op)
+ )
def _values_for_argsort(self):
return self._codes.copy()
- def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
+ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
"""
Return the indices that would sort the Categorical.
@@ -1584,7 +1642,7 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
"""
return super().argsort(ascending=ascending, kind=kind, *args, **kwargs)
- def sort_values(self, inplace=False, ascending=True, na_position='last'):
+ def sort_values(self, inplace=False, ascending=True, na_position="last"):
"""
Sort the Categorical by category value returning a new
Categorical by default.
@@ -1658,21 +1716,19 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
[NaN, NaN, 5.0, 2.0, 2.0]
Categories (2, int64): [2, 5]
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
- if na_position not in ['last', 'first']:
- msg = 'invalid na_position: {na_position!r}'
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ if na_position not in ["last", "first"]:
+ msg = "invalid na_position: {na_position!r}"
raise ValueError(msg.format(na_position=na_position))
- sorted_idx = nargsort(self,
- ascending=ascending,
- na_position=na_position)
+ sorted_idx = nargsort(self, ascending=ascending, na_position=na_position)
if inplace:
self._codes = self._codes[sorted_idx]
else:
- return self._constructor(values=self._codes[sorted_idx],
- dtype=self.dtype,
- fastpath=True)
+ return self._constructor(
+ values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True
+ )
def _values_for_rank(self):
"""
@@ -1687,11 +1743,12 @@ def _values_for_rank(self):
"""
from pandas import Series
+
if self.ordered:
values = self.codes
mask = values == -1
if mask.any():
- values = values.astype('float64')
+ values = values.astype("float64")
values[mask] = np.nan
elif self.categories.is_numeric():
values = np.array(self)
@@ -1703,7 +1760,7 @@ def _values_for_rank(self):
)
return values
- def ravel(self, order='C'):
+ def ravel(self, order="C"):
"""
Return a flattened (numpy) array.
@@ -1713,9 +1770,12 @@ def ravel(self, order='C'):
-------
numpy.array
"""
- warn("Categorical.ravel will return a Categorical object instead "
- "of an ndarray in a future version.",
- FutureWarning, stacklevel=2)
+ warn(
+ "Categorical.ravel will return a Categorical object instead "
+ "of an ndarray in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
return np.array(self)
def view(self):
@@ -1743,7 +1803,7 @@ def to_dense(self):
"""
return np.asarray(self)
- @deprecate_kwarg(old_arg_name='fill_value', new_arg_name='value')
+ @deprecate_kwarg(old_arg_name="fill_value", new_arg_name="value")
def fillna(self, value=None, method=None, limit=None):
"""
Fill NA/NaN values using the specified method.
@@ -1780,8 +1840,9 @@ def fillna(self, value=None, method=None, limit=None):
if value is None:
value = np.nan
if limit is not None:
- raise NotImplementedError("specifying a limit for fillna has not "
- "been implemented yet")
+ raise NotImplementedError(
+ "specifying a limit for fillna has not been implemented yet"
+ )
codes = self._codes
@@ -1789,8 +1850,9 @@ def fillna(self, value=None, method=None, limit=None):
if method is not None:
values = self.to_dense().reshape(-1, len(self))
- values = interpolate_2d(values, method, 0, None,
- value).astype(self.categories.dtype)[0]
+ values = interpolate_2d(values, method, 0, None, value).astype(
+ self.categories.dtype
+ )[0]
codes = _get_codes_for_values(values, self.categories)
else:
@@ -1819,9 +1881,11 @@ def fillna(self, value=None, method=None, limit=None):
codes[mask] = self.categories.get_loc(value)
else:
- raise TypeError('"value" parameter must be a scalar, dict '
- 'or Series, but you passed a '
- '"{0}"'.format(type(value).__name__))
+ raise TypeError(
+ '"value" parameter must be a scalar, dict '
+ "or Series, but you passed a "
+ '"{0}"'.format(type(value).__name__)
+ )
return self._constructor(codes, dtype=self.dtype, fastpath=True)
@@ -1913,14 +1977,10 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):
if fill_value in self.categories:
fill_value = self.categories.get_loc(fill_value)
else:
- msg = (
- "'fill_value' ('{}') is not in this Categorical's "
- "categories."
- )
+ msg = "'fill_value' ('{}') is not in this Categorical's categories."
raise TypeError(msg.format(fill_value))
- codes = take(self._codes, indexer, allow_fill=allow_fill,
- fill_value=fill_value)
+ codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value)
result = type(self).from_codes(codes, dtype=dtype)
return result
@@ -1937,8 +1997,7 @@ def _slice(self, slicer):
# in a 2-d case be passed (slice(None),....)
if isinstance(slicer, tuple) and len(slicer) == 2:
if not com.is_null_slice(slicer[0]):
- raise AssertionError("invalid slicing for a 1-ndim "
- "categorical")
+ raise AssertionError("invalid slicing for a 1-ndim categorical")
slicer = slicer[1]
codes = self._codes[slicer]
@@ -1972,12 +2031,13 @@ def _tidy_repr(self, max_vals=10, footer=True):
"""
num = max_vals // 2
head = self[:num]._get_repr(length=False, footer=False)
- tail = self[-(max_vals - num):]._get_repr(length=False, footer=False)
+ tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
- result = '{head}, ..., {tail}'.format(head=head[:-1], tail=tail[1:])
+ result = "{head}, ..., {tail}".format(head=head[:-1], tail=tail[1:])
if footer:
- result = '{result}\n{footer}'.format(
- result=result, footer=self._repr_footer())
+ result = "{result}\n{footer}".format(
+ result=result, footer=self._repr_footer()
+ )
return str(result)
@@ -1985,9 +2045,13 @@ def _repr_categories(self):
"""
return the base repr for the categories
"""
- max_categories = (10 if get_option("display.max_categories") == 0 else
- get_option("display.max_categories"))
+ max_categories = (
+ 10
+ if get_option("display.max_categories") == 0
+ else get_option("display.max_categories")
+ )
from pandas.io.formats import format as fmt
+
if len(self.categories) > max_categories:
num = max_categories // 2
head = fmt.format_array(self.categories[:num], None)
@@ -2008,7 +2072,8 @@ def _repr_categories_info(self):
category_strs = self._repr_categories()
dtype = str(self.categories.dtype)
levheader = "Categories ({length}, {dtype}): ".format(
- length=len(self.categories), dtype=dtype)
+ length=len(self.categories), dtype=dtype
+ )
width, height = get_terminal_size()
max_width = get_option("display.width") or width
if console.in_ipython_frontend():
@@ -2033,13 +2098,16 @@ def _repr_categories_info(self):
def _repr_footer(self):
- return 'Length: {length}\n{info}'.format(
- length=len(self), info=self._repr_categories_info())
+ return "Length: {length}\n{info}".format(
+ length=len(self), info=self._repr_categories_info()
+ )
- def _get_repr(self, length=True, na_rep='NaN', footer=True):
+ def _get_repr(self, length=True, na_rep="NaN", footer=True):
from pandas.io.formats import format as fmt
- formatter = fmt.CategoricalFormatter(self, length=length,
- na_rep=na_rep, footer=footer)
+
+ formatter = fmt.CategoricalFormatter(
+ self, length=length, na_rep=na_rep, footer=footer
+ )
result = formatter.to_string()
return str(result)
@@ -2054,7 +2122,7 @@ def __repr__(self):
result = self._get_repr(length=len(self) > _maxlen)
else:
msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
- result = ('[], {repr_msg}'.format(repr_msg=msg))
+ result = "[], {repr_msg}".format(repr_msg=msg)
return result
@@ -2062,7 +2130,7 @@ def _maybe_coerce_indexer(self, indexer):
"""
return an indexer coerced to the codes dtype
"""
- if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i':
+ if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i":
indexer = indexer.astype(self._codes.dtype)
return indexer
@@ -2077,8 +2145,9 @@ def __getitem__(self, key):
else:
return self.categories[i]
else:
- return self._constructor(values=self._codes[key],
- dtype=self.dtype, fastpath=True)
+ return self._constructor(
+ values=self._codes[key], dtype=self.dtype, fastpath=True
+ )
def __setitem__(self, key, value):
"""
@@ -2098,8 +2167,10 @@ def __setitem__(self, key, value):
# require identical categories set
if isinstance(value, Categorical):
if not is_dtype_equal(self, value):
- raise ValueError("Cannot set a Categorical with another, "
- "without identical categories")
+ raise ValueError(
+ "Cannot set a Categorical with another, "
+ "without identical categories"
+ )
if not self.categories.equals(value.categories):
new_codes = _recode_for_categories(
value.codes, value.categories, self.categories
@@ -2109,13 +2180,16 @@ def __setitem__(self, key, value):
rvalue = value if is_list_like(value) else [value]
from pandas import Index
+
to_add = Index(rvalue).difference(self.categories)
# no assignments of values not in categories, but it's always ok to set
# something to np.nan
if len(to_add) and not isna(to_add).all():
- raise ValueError("Cannot setitem on a Categorical with a new "
- "category, set the categories first")
+ raise ValueError(
+ "Cannot setitem on a Categorical with a new "
+ "category, set the categories first"
+ )
# set by position
if isinstance(key, (int, np.integer)):
@@ -2127,14 +2201,12 @@ def __setitem__(self, key, value):
# in a 2-d case be passed (slice(None),....)
if len(key) == 2:
if not com.is_null_slice(key[0]):
- raise AssertionError("invalid slicing for a 1-ndim "
- "categorical")
+ raise AssertionError("invalid slicing for a 1-ndim categorical")
key = key[1]
elif len(key) == 1:
key = key[0]
else:
- raise AssertionError("invalid slicing for a 1-ndim "
- "categorical")
+ raise AssertionError("invalid slicing for a 1-ndim categorical")
# slicing in Series or Categorical
elif isinstance(key, slice):
@@ -2172,8 +2244,9 @@ def _reverse_indexer(self):
"""
categories = self.categories
- r, counts = libalgos.groupsort_indexer(self.codes.astype('int64'),
- categories.size)
+ r, counts = libalgos.groupsort_indexer(
+ self.codes.astype("int64"), categories.size
+ )
counts = counts.cumsum()
result = (r[start:end] for start, end in zip(counts, counts[1:]))
result = dict(zip(categories, result))
@@ -2183,7 +2256,7 @@ def _reverse_indexer(self):
def _reduce(self, name, axis=0, **kwargs):
func = getattr(self, name, None)
if func is None:
- msg = 'Categorical cannot perform the operation {op}'
+ msg = "Categorical cannot perform the operation {op}"
raise TypeError(msg.format(op=name))
return func(**kwargs)
@@ -2202,7 +2275,7 @@ def min(self, numeric_only=None, **kwargs):
-------
min : the minimum of this `Categorical`
"""
- self.check_for_ordered('min')
+ self.check_for_ordered("min")
if numeric_only:
good = self._codes != -1
pointer = self._codes[good].min(**kwargs)
@@ -2228,7 +2301,7 @@ def max(self, numeric_only=None, **kwargs):
-------
max : the maximum of this `Categorical`
"""
- self.check_for_ordered('max')
+ self.check_for_ordered("max")
if numeric_only:
good = self._codes != -1
pointer = self._codes[good].max(**kwargs)
@@ -2258,6 +2331,7 @@ def mode(self, dropna=True):
"""
import pandas._libs.hashtable as htable
+
codes = self._codes
if dropna:
good = self._codes != -1
@@ -2322,13 +2396,14 @@ def unique(self):
return cat.set_categories(cat.categories.take(take_codes))
def _values_for_factorize(self):
- codes = self.codes.astype('int64')
+ codes = self.codes.astype("int64")
return codes, -1
@classmethod
def _from_factorized(cls, uniques, original):
- return original._constructor(original.categories.take(uniques),
- dtype=original.dtype)
+ return original._constructor(
+ original.categories.take(uniques), dtype=original.dtype
+ )
def equals(self, other):
"""
@@ -2347,9 +2422,9 @@ def equals(self, other):
# fastpath to avoid re-coding
other_codes = other._codes
else:
- other_codes = _recode_for_categories(other.codes,
- other.categories,
- self.categories)
+ other_codes = _recode_for_categories(
+ other.codes, other.categories, self.categories
+ )
return np.array_equal(self._codes, other_codes)
return False
@@ -2385,14 +2460,15 @@ def describe(self):
freqs = counts / float(counts.sum())
from pandas.core.reshape.concat import concat
+
result = concat([counts, freqs], axis=1)
- result.columns = ['counts', 'freqs']
- result.index.name = 'categories'
+ result.columns = ["counts", "freqs"]
+ result.index.name = "categories"
return result
- @Substitution(klass='Categorical')
- @Appender(_extension_array_shared_docs['repeat'])
+ @Substitution(klass="Categorical")
+ @Appender(_extension_array_shared_docs["repeat"])
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
codes = self._codes.repeat(repeats)
@@ -2452,10 +2528,14 @@ def isin(self, values):
array([ True, False, True, False, True, False])
"""
from pandas.core.internals.construction import sanitize_array
+
if not is_list_like(values):
- raise TypeError("only list-like objects are allowed to be passed"
- " to isin(), you passed a [{values_type}]"
- .format(values_type=type(values).__name__))
+ raise TypeError(
+ "only list-like objects are allowed to be passed"
+ " to isin(), you passed a [{values_type}]".format(
+ values_type=type(values).__name__
+ )
+ )
values = sanitize_array(values, None, None)
null_mask = np.asarray(isna(values))
code_values = self.categories.get_indexer(values)
@@ -2466,15 +2546,23 @@ def isin(self, values):
# The Series.cat accessor
-@delegate_names(delegate=Categorical,
- accessors=["categories", "ordered"],
- typ="property")
-@delegate_names(delegate=Categorical,
- accessors=["rename_categories", "reorder_categories",
- "add_categories", "remove_categories",
- "remove_unused_categories", "set_categories",
- "as_ordered", "as_unordered"],
- typ="method")
+@delegate_names(
+ delegate=Categorical, accessors=["categories", "ordered"], typ="property"
+)
+@delegate_names(
+ delegate=Categorical,
+ accessors=[
+ "rename_categories",
+ "reorder_categories",
+ "add_categories",
+ "remove_categories",
+ "remove_unused_categories",
+ "set_categories",
+ "as_ordered",
+ "as_unordered",
+ ],
+ typ="method",
+)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
"""
Accessor object for categorical properties of the Series values.
@@ -2511,8 +2599,9 @@ def __init__(self, data):
@staticmethod
def _validate(data):
if not is_categorical_dtype(data.dtype):
- raise AttributeError("Can only use .cat accessor with a "
- "'category' dtype")
+ raise AttributeError(
+ "Can only use .cat accessor with a 'category' dtype"
+ )
def _delegate_property_get(self, name):
return getattr(self._parent, name)
@@ -2526,10 +2615,12 @@ def codes(self):
Return Series of codes as well as the index.
"""
from pandas import Series
+
return Series(self._parent.codes, index=self._index)
def _delegate_method(self, name, *args, **kwargs):
from pandas import Series
+
method = getattr(self._parent, name)
res = method(*args, **kwargs)
if res is not None:
@@ -2540,10 +2631,12 @@ def categorical(self):
# Note: Upon deprecation, `test_tab_completion_with_categorical` will
# need to be updated. `categorical` will need to be removed from
# `ok_for_cat`.
- warn("`Series.cat.categorical` has been deprecated. Use the "
- "attributes on 'Series.cat' directly instead.",
- FutureWarning,
- stacklevel=2)
+ warn(
+ "`Series.cat.categorical` has been deprecated. Use the "
+ "attributes on 'Series.cat' directly instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._parent
@property
@@ -2551,10 +2644,11 @@ def name(self):
# Note: Upon deprecation, `test_tab_completion_with_categorical` will
# need to be updated. `name` will need to be removed from
# `ok_for_cat`.
- warn("`Series.cat.name` has been deprecated. Use `Series.name` "
- "instead.",
- FutureWarning,
- stacklevel=2)
+ warn(
+ "`Series.cat.name` has been deprecated. Use `Series.name` instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._name
@property
@@ -2562,12 +2656,14 @@ def index(self):
# Note: Upon deprecation, `test_tab_completion_with_categorical` will
# need to be updated. `index` will need to be removed from
# `ok_for_cat`.
- warn("`Series.cat.index` has been deprecated. Use `Series.index` "
- "instead.",
- FutureWarning,
- stacklevel=2)
+ warn(
+ "`Series.cat.index` has been deprecated. Use `Series.index` instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._index
+
# utility routines
@@ -2576,22 +2672,20 @@ def _get_codes_for_values(values, categories):
utility routine to turn values into codes given the specified categories
"""
from pandas.core.algorithms import _get_data_algo, _hashtables
+
dtype_equal = is_dtype_equal(values.dtype, categories.dtype)
if dtype_equal:
# To prevent erroneous dtype coercion in _get_data_algo, retrieve
# the underlying numpy array. gh-22702
- values = getattr(values, '_ndarray_values', values)
- categories = getattr(categories, '_ndarray_values', categories)
- elif (is_extension_array_dtype(categories.dtype) and
- is_object_dtype(values)):
+ values = getattr(values, "_ndarray_values", values)
+ categories = getattr(categories, "_ndarray_values", categories)
+ elif is_extension_array_dtype(categories.dtype) and is_object_dtype(values):
# Support inferring the correct extension dtype from an array of
# scalar objects. e.g.
# Categorical(array[Period, Period], categories=PeriodIndex(...))
try:
- values = (
- categories.dtype.construct_array_type()._from_sequence(values)
- )
+ values = categories.dtype.construct_array_type()._from_sequence(values)
except Exception:
# but that may fail for any reason, so fall back to object
values = ensure_object(values)
@@ -2636,8 +2730,9 @@ def _recode_for_categories(codes, old_categories, new_categories):
elif new_categories.equals(old_categories):
# Same categories, so no need to actually recode
return codes.copy()
- indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
- new_categories)
+ indexer = coerce_indexer_dtype(
+ new_categories.get_indexer(old_categories), new_categories
+ )
new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
return new_codes
@@ -2647,8 +2742,7 @@ def _convert_to_list_like(list_like):
return list_like
if isinstance(list_like, list):
return list_like
- if (is_sequence(list_like) or isinstance(list_like, tuple) or
- is_iterator(list_like)):
+ if is_sequence(list_like) or isinstance(list_like, tuple) or is_iterator(list_like):
return list(list_like)
elif is_scalar(list_like):
return [list_like]
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 93166759d8dbd..540442b7eaed4 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -7,30 +7,40 @@
from pandas._libs import NaT, NaTType, Timestamp, algos, iNaT, lib
from pandas._libs.tslibs.c_timestamp import maybe_integer_op_deprecated
-from pandas._libs.tslibs.period import (
- DIFFERENT_FREQ, IncompatibleFrequency, Period)
+from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period
from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds
from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64
from pandas.compat.numpy import function as nv
-from pandas.errors import (
- AbstractMethodError, NullFrequencyError, PerformanceWarning)
+from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning
from pandas.util._decorators import Appender, Substitution
from pandas.util._validators import validate_fillna_kwargs
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_datetime64_any_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal,
- is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like,
- is_object_dtype, is_offsetlike, is_period_dtype, is_string_dtype,
- is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype)
+ is_categorical_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ is_list_like,
+ is_object_dtype,
+ is_offsetlike,
+ is_period_dtype,
+ is_string_dtype,
+ is_timedelta64_dtype,
+ is_unsigned_integer_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import isna
from pandas._typing import DatetimeLikeScalar
from pandas.core import missing, nanops
-from pandas.core.algorithms import (
- checked_add_with_arr, take, unique1d, value_counts)
+from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
import pandas.core.common as com
from pandas.tseries import frequencies
@@ -68,8 +78,7 @@ def _scalar_type(self) -> Type[DatetimeLikeScalar]:
raise AbstractMethodError(self)
def _scalar_from_string(
- self,
- value: str,
+ self, value: str
) -> Union[Period, Timestamp, Timedelta, NaTType]:
"""
Construct a scalar type from a string.
@@ -90,10 +99,7 @@ def _scalar_from_string(
"""
raise AbstractMethodError(self)
- def _unbox_scalar(
- self,
- value: Union[Period, Timestamp, Timedelta, NaTType],
- ) -> int:
+ def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> int:
"""
Unbox the integer value of a scalar `value`.
@@ -113,8 +119,7 @@ def _unbox_scalar(
raise AbstractMethodError(self)
def _check_compatible_with(
- self,
- other: Union[Period, Timestamp, Timedelta, NaTType],
+ self, other: Union[Period, Timestamp, Timedelta, NaTType]
) -> None:
"""
Verify that `self` and `other` are compatible.
@@ -141,8 +146,10 @@ class DatelikeOps:
Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex.
"""
- @Substitution(URL="https://docs.python.org/3/library/datetime.html"
- "#strftime-and-strptime-behavior")
+ @Substitution(
+ URL="https://docs.python.org/3/library/datetime.html"
+ "#strftime-and-strptime-behavior"
+ )
def strftime(self, date_format):
"""
Convert to Index using specified date_format.
@@ -179,6 +186,7 @@ def strftime(self, date_format):
dtype='object')
"""
from pandas import Index
+
return Index(self._format_native_types(date_format=date_format))
@@ -187,8 +195,7 @@ class TimelikeOps:
Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex.
"""
- _round_doc = (
- """
+ _round_doc = """
Perform {op} operation on the data to the specified `freq`.
Parameters
@@ -247,10 +254,9 @@ class TimelikeOps:
DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00',
'2018-01-01 12:01:00'],
dtype='datetime64[ns]', freq='T')
- """)
+ """
- _round_example = (
- """>>> rng.round('H')
+ _round_example = """>>> rng.round('H')
DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
'2018-01-01 12:00:00'],
dtype='datetime64[ns]', freq=None)
@@ -262,10 +268,9 @@ class TimelikeOps:
1 2018-01-01 12:00:00
2 2018-01-01 12:00:00
dtype: datetime64[ns]
- """)
+ """
- _floor_example = (
- """>>> rng.floor('H')
+ _floor_example = """>>> rng.floor('H')
DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00',
'2018-01-01 12:00:00'],
dtype='datetime64[ns]', freq=None)
@@ -278,10 +283,8 @@ class TimelikeOps:
2 2018-01-01 12:00:00
dtype: datetime64[ns]
"""
- )
- _ceil_example = (
- """>>> rng.ceil('H')
+ _ceil_example = """>>> rng.ceil('H')
DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
'2018-01-01 13:00:00'],
dtype='datetime64[ns]', freq=None)
@@ -294,7 +297,6 @@ class TimelikeOps:
2 2018-01-01 13:00:00
dtype: datetime64[ns]
"""
- )
def _round(self, freq, mode, ambiguous, nonexistent):
# round the local times
@@ -310,23 +312,19 @@ def _round(self, freq, mode, ambiguous, nonexistent):
)
@Appender((_round_doc + _round_example).format(op="round"))
- def round(self, freq, ambiguous='raise', nonexistent='raise'):
- return self._round(
- freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent
- )
+ def round(self, freq, ambiguous="raise", nonexistent="raise"):
+ return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent)
@Appender((_round_doc + _floor_example).format(op="floor"))
- def floor(self, freq, ambiguous='raise', nonexistent='raise'):
+ def floor(self, freq, ambiguous="raise", nonexistent="raise"):
return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent)
@Appender((_round_doc + _ceil_example).format(op="ceil"))
- def ceil(self, freq, ambiguous='raise', nonexistent='raise'):
+ def ceil(self, freq, ambiguous="raise", nonexistent="raise"):
return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)
-class DatetimeLikeArrayMixin(ExtensionOpsMixin,
- AttributesMixin,
- ExtensionArray):
+class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray):
"""
Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray
@@ -365,7 +363,7 @@ def asi8(self) -> np.ndarray:
An ndarray with int64 dtype.
"""
# do not cache or you'll create a memory leak
- return self._data.view('i8')
+ return self._data.view("i8")
@property
def _ndarray_values(self):
@@ -374,7 +372,7 @@ def _ndarray_values(self):
# ----------------------------------------------------------------
# Rendering Methods
- def _format_native_types(self, na_rep='NaT', date_format=None):
+ def _format_native_types(self, na_rep="NaT", date_format=None):
"""
Helper method for astype when converting to strings.
@@ -417,9 +415,11 @@ def __getitem__(self, key):
is_int = lib.is_integer(key)
if lib.is_scalar(key) and not is_int:
- raise IndexError("only integers, slices (`:`), ellipsis (`...`), "
- "numpy.newaxis (`None`) and integer or boolean "
- "arrays are valid indices")
+ raise IndexError(
+ "only integers, slices (`:`), ellipsis (`...`), "
+ "numpy.newaxis (`None`) and integer or boolean "
+ "arrays are valid indices"
+ )
getitem = self._data.__getitem__
if is_int:
@@ -459,9 +459,9 @@ def __getitem__(self, key):
return self._simple_new(result, dtype=self.dtype, freq=freq)
def __setitem__(
- self,
- key: Union[int, Sequence[int], Sequence[bool], slice],
- value: Union[NaTType, Any, Sequence[Any]]
+ self,
+ key: Union[int, Sequence[int], Sequence[bool], slice],
+ value: Union[NaTType, Any, Sequence[Any]],
) -> None:
# I'm fudging the types a bit here. "Any" above really depends
# on type(self). For PeriodArray, it's Period (or stuff coercible
@@ -477,12 +477,12 @@ def __setitem__(
if not is_slice:
key = cast(Sequence, key)
- if (len(key) != len(value)
- and not com.is_bool_indexer(key)):
- msg = ("shape mismatch: value array of length '{}' does "
- "not match indexing result of length '{}'.")
- raise ValueError(msg.format(
- len(key), len(value)))
+ if len(key) != len(value) and not com.is_bool_indexer(key):
+ msg = (
+ "shape mismatch: value array of length '{}' does "
+ "not match indexing result of length '{}'."
+ )
+ raise ValueError(msg.format(len(key), len(value)))
elif not len(key):
return
@@ -499,8 +499,9 @@ def __setitem__(
"'value' should be a '{scalar}', 'NaT', or array of those. "
"Got '{typ}' instead."
)
- raise TypeError(msg.format(scalar=self._scalar_type.__name__,
- typ=type(value).__name__))
+ raise TypeError(
+ msg.format(scalar=self._scalar_type.__name__, typ=type(value).__name__)
+ )
self._data[key] = value
self._maybe_clear_freq()
@@ -515,6 +516,7 @@ def astype(self, dtype, copy=True):
# 2. DatetimeArray.astype handles conversion between tz.
# 3. DatetimeArray.astype handles datetime -> period
from pandas import Categorical
+
dtype = pandas_dtype(dtype)
if is_object_dtype(dtype):
@@ -533,11 +535,13 @@ def astype(self, dtype, copy=True):
if copy:
values = values.copy()
return values
- elif (is_datetime_or_timedelta_dtype(dtype) and
- not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
+ elif (
+ is_datetime_or_timedelta_dtype(dtype)
+ and not is_dtype_equal(self.dtype, dtype)
+ ) or is_float_dtype(dtype):
# disallow conversion between datetime/timedelta,
# and conversions for any datetimelike to float
- msg = 'Cannot cast {name} to dtype {dtype}'
+ msg = "Cannot cast {name} to dtype {dtype}"
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
elif is_categorical_dtype(dtype):
return Categorical(self, dtype=dtype)
@@ -589,10 +593,9 @@ def take(self, indices, allow_fill=False, fill_value=None):
if allow_fill:
fill_value = self._validate_fill_value(fill_value)
- new_values = take(self.asi8,
- indices,
- allow_fill=allow_fill,
- fill_value=fill_value)
+ new_values = take(
+ self.asi8, indices, allow_fill=allow_fill, fill_value=fill_value
+ )
return type(self)(new_values, dtype=self.dtype)
@@ -624,7 +627,7 @@ def _values_for_argsort(self):
# These are not part of the EA API, but we implement them because
# pandas assumes they're there.
- def searchsorted(self, value, side='left', sorter=None):
+ def searchsorted(self, value, side="left", sorter=None):
"""
Find indices where elements should be inserted to maintain order.
@@ -652,10 +655,10 @@ def searchsorted(self, value, side='left', sorter=None):
if isinstance(value, str):
value = self._scalar_from_string(value)
- if not (isinstance(value, (self._scalar_type, type(self)))
- or isna(value)):
- raise ValueError("Unexpected type for 'value': {valtype}"
- .format(valtype=type(value)))
+ if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)):
+ raise ValueError(
+ "Unexpected type for 'value': {valtype}".format(valtype=type(value))
+ )
self._check_compatible_with(value)
if isinstance(value, type(self)):
@@ -675,7 +678,7 @@ def repeat(self, repeats, *args, **kwargs):
"""
nv.validate_repeat(args, kwargs)
values = self._data.repeat(repeats)
- return type(self)(values.view('i8'), dtype=self.dtype)
+ return type(self)(values.view("i8"), dtype=self.dtype)
def value_counts(self, dropna=False):
"""
@@ -700,8 +703,9 @@ def value_counts(self, dropna=False):
cls = type(self)
result = value_counts(values, sort=False, dropna=dropna)
- index = Index(cls(result.index.view('i8'), dtype=self.dtype),
- name=result.index.name)
+ index = Index(
+ cls(result.index.view("i8"), dtype=self.dtype), name=result.index.name
+ )
return Series(result.values, index=index, name=result.name)
def map(self, mapper):
@@ -725,7 +729,7 @@ def _isnan(self):
"""
return if each value is nan
"""
- return (self.asi8 == iNaT)
+ return self.asi8 == iNaT
@property # NB: override with cache_readonly in immutable subclasses
def _hasnans(self):
@@ -773,13 +777,15 @@ def fillna(self, value=None, method=None, limit=None):
if is_array_like(value):
if len(value) != len(self):
- raise ValueError("Length of 'value' does not match. Got ({}) "
- " expected {}".format(len(value), len(self)))
+ raise ValueError(
+ "Length of 'value' does not match. Got ({}) "
+ " expected {}".format(len(value), len(self))
+ )
value = value[mask]
if mask.any():
if method is not None:
- if method == 'pad':
+ if method == "pad":
func = missing.pad_1d
else:
func = missing.backfill_1d
@@ -791,8 +797,7 @@ def fillna(self, value=None, method=None, limit=None):
# to avoid modifying `self` in-place.
values = values.copy()
- new_values = func(values, limit=limit,
- mask=mask)
+ new_values = func(values, limit=limit, mask=mask)
if is_datetime64tz_dtype(self):
# we need to pass int64 values to the constructor to avoid
# re-localizing incorrectly
@@ -878,9 +883,9 @@ def _validate_frequency(cls, index, freq, **kwargs):
return None
try:
- on_freq = cls._generate_range(start=index[0], end=None,
- periods=len(index), freq=freq,
- **kwargs)
+ on_freq = cls._generate_range(
+ start=index[0], end=None, periods=len(index), freq=freq, **kwargs
+ )
if not np.array_equal(index.asi8, on_freq.asi8):
raise ValueError
except ValueError as e:
@@ -893,9 +898,12 @@ def _validate_frequency(cls, index, freq, **kwargs):
# is `NaT`, in which case the call to `cls._generate_range` will
# raise a ValueError, which we re-raise with a more targeted
# message.
- raise ValueError('Inferred frequency {infer} from passed values '
- 'does not conform to passed frequency {passed}'
- .format(infer=inferred, passed=freq.freqstr))
+ raise ValueError(
+ "Inferred frequency {infer} from passed values "
+ "does not conform to passed frequency {passed}".format(
+ infer=inferred, passed=freq.freqstr
+ )
+ )
# monotonicity/uniqueness properties are called via frequencies.infer_freq,
# see GH#23789
@@ -917,24 +925,28 @@ def _is_unique(self):
def _add_datetimelike_scalar(self, other):
# Overridden by TimedeltaArray
- raise TypeError("cannot add {cls} and {typ}"
- .format(cls=type(self).__name__,
- typ=type(other).__name__))
+ raise TypeError(
+ "cannot add {cls} and {typ}".format(
+ cls=type(self).__name__, typ=type(other).__name__
+ )
+ )
_add_datetime_arraylike = _add_datetimelike_scalar
def _sub_datetimelike_scalar(self, other):
# Overridden by DatetimeArray
assert other is not NaT
- raise TypeError("cannot subtract a datelike from a {cls}"
- .format(cls=type(self).__name__))
+ raise TypeError(
+ "cannot subtract a datelike from a {cls}".format(cls=type(self).__name__)
+ )
_sub_datetime_arraylike = _sub_datetimelike_scalar
def _sub_period(self, other):
# Overridden by PeriodArray
- raise TypeError("cannot subtract Period from a {cls}"
- .format(cls=type(self).__name__))
+ raise TypeError(
+ "cannot subtract Period from a {cls}".format(cls=type(self).__name__)
+ )
def _add_offset(self, offset):
raise AbstractMethodError(self)
@@ -973,15 +985,16 @@ def _add_timedeltalike_scalar(self, other):
"""
if isna(other):
# i.e. np.timedelta64("NaT"), not recognized by delta_to_nanoseconds
- new_values = np.empty(len(self), dtype='i8')
+ new_values = np.empty(len(self), dtype="i8")
new_values[:] = iNaT
return new_values
inc = delta_to_nanoseconds(other)
- new_values = checked_add_with_arr(self.asi8, inc,
- arr_mask=self._isnan).view('i8')
+ new_values = checked_add_with_arr(self.asi8, inc, arr_mask=self._isnan).view(
+ "i8"
+ )
new_values = self._maybe_mask_results(new_values)
- return new_values.view('i8')
+ return new_values.view("i8")
def _add_delta_tdi(self, other):
"""
@@ -994,26 +1007,29 @@ def _add_delta_tdi(self, other):
if isinstance(other, np.ndarray):
# ndarray[timedelta64]; wrap in TimedeltaIndex for op
from pandas import TimedeltaIndex
+
other = TimedeltaIndex(other)
self_i8 = self.asi8
other_i8 = other.asi8
- new_values = checked_add_with_arr(self_i8, other_i8,
- arr_mask=self._isnan,
- b_mask=other._isnan)
+ new_values = checked_add_with_arr(
+ self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan
+ )
if self._hasnans or other._hasnans:
mask = (self._isnan) | (other._isnan)
new_values[mask] = iNaT
- return new_values.view('i8')
+ return new_values.view("i8")
def _add_nat(self):
"""
Add pd.NaT to self
"""
if is_period_dtype(self):
- raise TypeError('Cannot add {cls} and {typ}'
- .format(cls=type(self).__name__,
- typ=type(NaT).__name__))
+ raise TypeError(
+ "Cannot add {cls} and {typ}".format(
+ cls=type(self).__name__, typ=type(NaT).__name__
+ )
+ )
# GH#19124 pd.NaT is treated like a timedelta for both timedelta
# and datetime dtypes
@@ -1033,7 +1049,7 @@ def _sub_nat(self):
# For period dtype, timedelta64 is a close-enough return dtype.
result = np.zeros(len(self), dtype=np.int64)
result.fill(iNaT)
- return result.view('timedelta64[ns]')
+ return result.view("timedelta64[ns]")
def _sub_period_array(self, other):
"""
@@ -1051,22 +1067,23 @@ def _sub_period_array(self, other):
Array of DateOffset objects; nulls represented by NaT.
"""
if not is_period_dtype(self):
- raise TypeError("cannot subtract {dtype}-dtype from {cls}"
- .format(dtype=other.dtype,
- cls=type(self).__name__))
+ raise TypeError(
+ "cannot subtract {dtype}-dtype from {cls}".format(
+ dtype=other.dtype, cls=type(self).__name__
+ )
+ )
if len(self) != len(other):
- raise ValueError("cannot subtract arrays/indices of "
- "unequal length")
+ raise ValueError("cannot subtract arrays/indices of unequal length")
if self.freq != other.freq:
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=other.freqstr)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr
+ )
raise IncompatibleFrequency(msg)
- new_values = checked_add_with_arr(self.asi8, -other.asi8,
- arr_mask=self._isnan,
- b_mask=other._isnan)
+ new_values = checked_add_with_arr(
+ self.asi8, -other.asi8, arr_mask=self._isnan, b_mask=other._isnan
+ )
new_values = np.array([self.freq.base * x for x in new_values])
if self._hasnans or other._hasnans:
@@ -1125,17 +1142,19 @@ def _addsub_offset_array(self, other, op):
if len(other) == 1:
return op(self, other[0])
- warnings.warn("Adding/subtracting array of DateOffsets to "
- "{cls} not vectorized"
- .format(cls=type(self).__name__), PerformanceWarning)
+ warnings.warn(
+ "Adding/subtracting array of DateOffsets to "
+ "{cls} not vectorized".format(cls=type(self).__name__),
+ PerformanceWarning,
+ )
# For EA self.astype('O') returns a numpy array, not an Index
- left = lib.values_from_object(self.astype('O'))
+ left = lib.values_from_object(self.astype("O"))
res_values = op(left, np.array(other))
kwargs = {}
if not is_period_dtype(self):
- kwargs['freq'] = 'infer'
+ kwargs["freq"] = "infer"
return self._from_sequence(res_values, **kwargs)
def _time_shift(self, periods, freq=None):
@@ -1173,8 +1192,7 @@ def _time_shift(self, periods, freq=None):
# Note: in the DatetimeTZ case, _generate_range will infer the
# appropriate timezone from `start` and `end`, so tz does not need
# to be passed explicitly.
- return self._generate_range(start=start, end=end, periods=None,
- freq=self.freq)
+ return self._generate_range(start=start, end=end, periods=None, freq=self.freq)
def __add__(self, other):
other = lib.item_from_zerodim(other)
@@ -1214,9 +1232,11 @@ def __add__(self, other):
result = self._addsub_int_array(other, operator.add)
elif is_float_dtype(other):
# Explicitly catch invalid dtypes
- raise TypeError("cannot add {dtype}-dtype to {cls}"
- .format(dtype=other.dtype,
- cls=type(self).__name__))
+ raise TypeError(
+ "cannot add {dtype}-dtype to {cls}".format(
+ dtype=other.dtype, cls=type(self).__name__
+ )
+ )
elif is_period_dtype(other):
# if self is a TimedeltaArray and other is a PeriodArray with
# a timedelta-like (i.e. Tick) freq, this operation is valid.
@@ -1231,6 +1251,7 @@ def __add__(self, other):
if is_timedelta64_dtype(result) and isinstance(result, np.ndarray):
from pandas.core.arrays import TimedeltaArray
+
# TODO: infer freq?
return TimedeltaArray(result)
return result
@@ -1282,14 +1303,18 @@ def __sub__(self, other):
maybe_integer_op_deprecated(self)
result = self._addsub_int_array(other, operator.sub)
elif isinstance(other, ABCIndexClass):
- raise TypeError("cannot subtract {cls} and {typ}"
- .format(cls=type(self).__name__,
- typ=type(other).__name__))
+ raise TypeError(
+ "cannot subtract {cls} and {typ}".format(
+ cls=type(self).__name__, typ=type(other).__name__
+ )
+ )
elif is_float_dtype(other):
# Explicitly catch invalid dtypes
- raise TypeError("cannot subtract {dtype}-dtype from {cls}"
- .format(dtype=other.dtype,
- cls=type(self).__name__))
+ raise TypeError(
+ "cannot subtract {dtype}-dtype from {cls}".format(
+ dtype=other.dtype, cls=type(self).__name__
+ )
+ )
elif is_extension_array_dtype(other):
# Categorical op will raise; defer explicitly
return NotImplemented
@@ -1298,6 +1323,7 @@ def __sub__(self, other):
if is_timedelta64_dtype(result) and isinstance(result, np.ndarray):
from pandas.core.arrays import TimedeltaArray
+
# TODO: infer freq?
return TimedeltaArray(result)
return result
@@ -1309,20 +1335,28 @@ def __rsub__(self, other):
if not isinstance(other, DatetimeLikeArrayMixin):
# Avoid down-casting DatetimeIndex
from pandas.core.arrays import DatetimeArray
+
other = DatetimeArray(other)
return other - self
- elif (is_datetime64_any_dtype(self) and hasattr(other, 'dtype') and
- not is_datetime64_any_dtype(other)):
+ elif (
+ is_datetime64_any_dtype(self)
+ and hasattr(other, "dtype")
+ and not is_datetime64_any_dtype(other)
+ ):
# GH#19959 datetime - datetime is well-defined as timedelta,
# but any other type - datetime is not well-defined.
- raise TypeError("cannot subtract {cls} from {typ}"
- .format(cls=type(self).__name__,
- typ=type(other).__name__))
+ raise TypeError(
+ "cannot subtract {cls} from {typ}".format(
+ cls=type(self).__name__, typ=type(other).__name__
+ )
+ )
elif is_period_dtype(self) and is_timedelta64_dtype(other):
# TODO: Can we simplify/generalize these cases at all?
- raise TypeError("cannot subtract {cls} from {dtype}"
- .format(cls=type(self).__name__,
- dtype=other.dtype))
+ raise TypeError(
+ "cannot subtract {cls} from {dtype}".format(
+ cls=type(self).__name__, dtype=other.dtype
+ )
+ )
return -(self - other)
# FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115
@@ -1337,8 +1371,9 @@ def __isub__(self, other):
# --------------------------------------------------------------
# Comparison Methods
- def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',
- from_utc=False):
+ def _ensure_localized(
+ self, arg, ambiguous="raise", nonexistent="raise", from_utc=False
+ ):
"""
Ensure that we are re-localized.
@@ -1360,12 +1395,12 @@ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',
"""
# reconvert to local tz
- tz = getattr(self, 'tz', None)
+ tz = getattr(self, "tz", None)
if tz is not None:
if not isinstance(arg, type(self)):
arg = self._simple_new(arg)
if from_utc:
- arg = arg.tz_localize('UTC').tz_convert(self.tz)
+ arg = arg.tz_localize("UTC").tz_convert(self.tz)
else:
arg = arg.tz_localize(
self.tz, ambiguous=ambiguous, nonexistent=nonexistent
@@ -1463,8 +1498,8 @@ def mean(self, skipna=True):
raise TypeError(
"mean is not implemented for {cls} since the meaning is "
"ambiguous. An alternative is "
- "obj.to_timestamp(how='start').mean()"
- .format(cls=type(self).__name__))
+ "obj.to_timestamp(how='start').mean()".format(cls=type(self).__name__)
+ )
mask = self.isna()
if skipna:
@@ -1478,7 +1513,7 @@ def mean(self, skipna=True):
# short-circuit for empty max / min
return NaT
- result = nanops.nanmean(values.view('i8'), skipna=skipna)
+ result = nanops.nanmean(values.view("i8"), skipna=skipna)
# Don't have to worry about NA `result`, since no NA went in.
return self._box_func(result)
@@ -1486,6 +1521,7 @@ def mean(self, skipna=True):
# -------------------------------------------------------------------
# Shared Constructor Helpers
+
def validate_periods(periods):
"""
If a `periods` argument is passed to the Datetime/Timedelta Array/Index
@@ -1508,8 +1544,9 @@ def validate_periods(periods):
if lib.is_float(periods):
periods = int(periods)
elif not lib.is_integer(periods):
- raise TypeError('periods must be a number, got {periods}'
- .format(periods=periods))
+ raise TypeError(
+ "periods must be a number, got {periods}".format(periods=periods)
+ )
return periods
@@ -1569,11 +1606,11 @@ def validate_inferred_freq(freq, inferred_freq, freq_infer):
"""
if inferred_freq is not None:
if freq is not None and freq != inferred_freq:
- raise ValueError('Inferred frequency {inferred} from passed '
- 'values does not conform to passed frequency '
- '{passed}'
- .format(inferred=inferred_freq,
- passed=freq.freqstr))
+ raise ValueError(
+ "Inferred frequency {inferred} from passed "
+ "values does not conform to passed frequency "
+ "{passed}".format(inferred=inferred_freq, passed=freq.freqstr)
+ )
elif freq is None:
freq = inferred_freq
freq_infer = False
@@ -1600,7 +1637,7 @@ def maybe_infer_freq(freq):
freq_infer = False
if not isinstance(freq, DateOffset):
# if a passed freq is None, don't infer automatically
- if freq != 'infer':
+ if freq != "infer":
freq = frequencies.to_offset(freq)
else:
freq_infer = True
@@ -1628,17 +1665,16 @@ def _ensure_datetimelike_to_i8(other, to_utc=False):
if lib.is_scalar(other) and isna(other):
return iNaT
- elif isinstance(other, (PeriodArray, ABCIndexClass,
- DatetimeLikeArrayMixin)):
+ elif isinstance(other, (PeriodArray, ABCIndexClass, DatetimeLikeArrayMixin)):
# convert tz if needed
- if getattr(other, 'tz', None) is not None:
+ if getattr(other, "tz", None) is not None:
if to_utc:
- other = other.tz_convert('UTC')
+ other = other.tz_convert("UTC")
else:
other = other.tz_localize(None)
else:
try:
- return np.array(other, copy=False).view('i8')
+ return np.array(other, copy=False).view("i8")
except TypeError:
# period array cannot be coerced to int
other = Index(other)
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 6b554ddf25c96..5b540dcce53c8 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -8,20 +8,44 @@
from pandas._libs import lib, tslib
from pandas._libs.tslibs import (
- NaT, Timestamp, ccalendar, conversion, fields, iNaT, normalize_date,
- resolution as libresolution, timezones, tzconversion)
+ NaT,
+ Timestamp,
+ ccalendar,
+ conversion,
+ fields,
+ iNaT,
+ normalize_date,
+ resolution as libresolution,
+ timezones,
+ tzconversion,
+)
import pandas.compat as compat
from pandas.errors import PerformanceWarning
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import (
- _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal,
- is_extension_type, is_float_dtype, is_object_dtype, is_period_dtype,
- is_string_dtype, is_timedelta64_dtype, pandas_dtype)
+ _INT64_DTYPE,
+ _NS_DTYPE,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64tz_dtype,
+ is_dtype_equal,
+ is_extension_type,
+ is_float_dtype,
+ is_object_dtype,
+ is_period_dtype,
+ is_string_dtype,
+ is_timedelta64_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCIndexClass, ABCPandasArray, ABCSeries)
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCPandasArray,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import isna
from pandas.core import ops
@@ -92,15 +116,16 @@ def f(self):
values = self._local_timestamps()
if field in self._bool_ops:
- if field.endswith(('start', 'end')):
+ if field.endswith(("start", "end")):
freq = self.freq
month_kw = 12
if freq:
kwds = freq.kwds
- month_kw = kwds.get('startingMonth', kwds.get('month', 12))
+ month_kw = kwds.get("startingMonth", kwds.get("month", 12))
- result = fields.get_start_end_field(values, field,
- self.freqstr, month_kw)
+ result = fields.get_start_end_field(
+ values, field, self.freqstr, month_kw
+ )
else:
result = fields.get_date_field(values, field)
@@ -113,8 +138,9 @@ def f(self):
else:
result = fields.get_date_field(values, field)
- result = self._maybe_mask_results(result, fill_value=None,
- convert='float64')
+ result = self._maybe_mask_results(
+ result, fill_value=None, convert="float64"
+ )
return result
@@ -127,8 +153,8 @@ def _dt_array_cmp(cls, op):
"""
Wrap comparison operations to convert datetime-like to datetime64
"""
- opname = '__{name}__'.format(name=op.__name__)
- nat_result = opname == '__ne__'
+ opname = "__{name}__".format(name=op.__name__)
+ nat_result = opname == "__ne__"
def wrapper(self, other):
if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
@@ -147,7 +173,7 @@ def wrapper(self, other):
# string that cannot be parsed to Timestamp
return ops.invalid_comparison(self, other, op)
- result = op(self.asi8, other.view('i8'))
+ result = op(self.asi8, other.view("i8"))
if isna(other):
result.fill(nat_result)
elif lib.is_scalar(other) or np.ndim(other) == 0:
@@ -160,8 +186,9 @@ def wrapper(self, other):
other = type(self)._from_sequence(other)
except ValueError:
other = np.array(other, dtype=np.object_)
- elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries,
- DatetimeArray)):
+ elif not isinstance(
+ other, (np.ndarray, ABCIndexClass, ABCSeries, DatetimeArray)
+ ):
# Following Timestamp convention, __eq__ is all-False
# and __ne__ is all True, others raise TypeError.
return ops.invalid_comparison(self, other, op)
@@ -170,13 +197,12 @@ def wrapper(self, other):
# We have to use _comp_method_OBJECT_ARRAY instead of numpy
# comparison otherwise it would fail to raise when
# comparing tz-aware and tz-naive
- with np.errstate(all='ignore'):
- result = ops._comp_method_OBJECT_ARRAY(op,
- self.astype(object),
- other)
+ with np.errstate(all="ignore"):
+ result = ops._comp_method_OBJECT_ARRAY(
+ op, self.astype(object), other
+ )
o_mask = isna(other)
- elif not (is_datetime64_dtype(other) or
- is_datetime64tz_dtype(other)):
+ elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)):
# e.g. is_timedelta64_dtype(other)
return ops.invalid_comparison(self, other, op)
else:
@@ -184,14 +210,16 @@ def wrapper(self, other):
if isinstance(other, (ABCIndexClass, ABCSeries)):
other = other.array
- if (is_datetime64_dtype(other) and
- not is_datetime64_ns_dtype(other) or
- not hasattr(other, 'asi8')):
+ if (
+ is_datetime64_dtype(other)
+ and not is_datetime64_ns_dtype(other)
+ or not hasattr(other, "asi8")
+ ):
# e.g. other.dtype == 'datetime64[s]'
# or an object-dtype ndarray
other = type(self)._from_sequence(other)
- result = op(self.view('i8'), other.view('i8'))
+ result = op(self.view("i8"), other.view("i8"))
o_mask = other._isnan
result = com.values_from_object(result)
@@ -207,9 +235,7 @@ def wrapper(self, other):
return compat.set_function_name(wrapper, opname, cls)
-class DatetimeArray(dtl.DatetimeLikeArrayMixin,
- dtl.TimelikeOps,
- dtl.DatelikeOps):
+class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps):
"""
Pandas ExtensionArray for tz-naive or tz-aware datetime data.
@@ -245,25 +271,53 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin,
-------
None
"""
+
_typ = "datetimearray"
_scalar_type = Timestamp
# define my properties & methods for delegation
- _bool_ops = ['is_month_start', 'is_month_end',
- 'is_quarter_start', 'is_quarter_end', 'is_year_start',
- 'is_year_end', 'is_leap_year']
- _object_ops = ['weekday_name', 'freq', 'tz']
- _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second',
- 'weekofyear', 'week', 'weekday', 'dayofweek',
- 'dayofyear', 'quarter', 'days_in_month',
- 'daysinmonth', 'microsecond',
- 'nanosecond']
- _other_ops = ['date', 'time', 'timetz']
+ _bool_ops = [
+ "is_month_start",
+ "is_month_end",
+ "is_quarter_start",
+ "is_quarter_end",
+ "is_year_start",
+ "is_year_end",
+ "is_leap_year",
+ ]
+ _object_ops = ["weekday_name", "freq", "tz"]
+ _field_ops = [
+ "year",
+ "month",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "weekofyear",
+ "week",
+ "weekday",
+ "dayofweek",
+ "dayofyear",
+ "quarter",
+ "days_in_month",
+ "daysinmonth",
+ "microsecond",
+ "nanosecond",
+ ]
+ _other_ops = ["date", "time", "timetz"]
_datetimelike_ops = _field_ops + _object_ops + _bool_ops + _other_ops
- _datetimelike_methods = ['to_period', 'tz_localize',
- 'tz_convert',
- 'normalize', 'strftime', 'round', 'floor',
- 'ceil', 'month_name', 'day_name']
+ _datetimelike_methods = [
+ "to_period",
+ "tz_localize",
+ "tz_convert",
+ "normalize",
+ "strftime",
+ "round",
+ "floor",
+ "ceil",
+ "month_name",
+ "day_name",
+ ]
# ndim is inherited from ExtensionArray, must exist to ensure
# Timestamp.__richcmp__(DateTimeArray) operates pointwise
@@ -286,7 +340,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
if isinstance(values, type(self)):
# validation
- dtz = getattr(dtype, 'tz', None)
+ dtz = getattr(dtype, "tz", None)
if dtz and values.tz is None:
dtype = DatetimeTZDtype(tz=dtype.tz)
elif dtz and values.tz:
@@ -312,7 +366,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
if values.ndim != 1:
raise ValueError("Only 1-dimensional input arrays are supported.")
- if values.dtype == 'i8':
+ if values.dtype == "i8":
# for compat with datetime/timedelta/period shared methods,
# we can sometimes get here with int64 values. These represent
# nanosecond UTC (or tz-naive) unix timestamps
@@ -338,7 +392,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
values = values.copy()
if freq:
freq = to_offset(freq)
- if getattr(dtype, 'tz', None):
+ if getattr(dtype, "tz", None):
# https://github.com/pandas-dev/pandas/issues/18595
# Ensure that we have a standard timezone for pytz objects.
# Without this, things like adding an array of timedeltas and
@@ -356,7 +410,7 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False):
@classmethod
def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE):
assert isinstance(values, np.ndarray)
- if values.dtype == 'i8':
+ if values.dtype == "i8":
values = values.view(_NS_DTYPE)
result = object.__new__(cls)
@@ -366,20 +420,33 @@ def _simple_new(cls, values, freq=None, dtype=_NS_DTYPE):
return result
@classmethod
- def _from_sequence(cls, data, dtype=None, copy=False,
- tz=None, freq=None,
- dayfirst=False, yearfirst=False, ambiguous='raise',
- int_as_wall_time=False):
+ def _from_sequence(
+ cls,
+ data,
+ dtype=None,
+ copy=False,
+ tz=None,
+ freq=None,
+ dayfirst=False,
+ yearfirst=False,
+ ambiguous="raise",
+ int_as_wall_time=False,
+ ):
freq, freq_infer = dtl.maybe_infer_freq(freq)
subarr, tz, inferred_freq = sequence_to_dt64ns(
- data, dtype=dtype, copy=copy, tz=tz,
- dayfirst=dayfirst, yearfirst=yearfirst,
- ambiguous=ambiguous, int_as_wall_time=int_as_wall_time)
+ data,
+ dtype=dtype,
+ copy=copy,
+ tz=tz,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ ambiguous=ambiguous,
+ int_as_wall_time=int_as_wall_time,
+ )
- freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq,
- freq_infer)
+ freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer)
dtype = tz_to_dtype(tz)
result = cls._simple_new(subarr, freq=freq, dtype=dtype)
@@ -396,18 +463,28 @@ def _from_sequence(cls, data, dtype=None, copy=False,
return result
@classmethod
- def _generate_range(cls, start, end, periods, freq, tz=None,
- normalize=False, ambiguous='raise',
- nonexistent='raise', closed=None):
+ def _generate_range(
+ cls,
+ start,
+ end,
+ periods,
+ freq,
+ tz=None,
+ normalize=False,
+ ambiguous="raise",
+ nonexistent="raise",
+ closed=None,
+ ):
periods = dtl.validate_periods(periods)
if freq is None and any(x is None for x in [periods, start, end]):
- raise ValueError('Must provide freq argument if no data is '
- 'supplied')
+ raise ValueError("Must provide freq argument if no data is " "supplied")
if com.count_not_none(start, end, periods, freq) != 3:
- raise ValueError('Of the four parameters: start, end, periods, '
- 'and freq, exactly three must be specified')
+ raise ValueError(
+ "Of the four parameters: start, end, periods, "
+ "and freq, exactly three must be specified"
+ )
freq = to_offset(freq)
if start is not None:
@@ -418,27 +495,31 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
if start is None and end is None:
if closed is not None:
- raise ValueError("Closed has to be None if not both of start"
- "and end are defined")
+ raise ValueError(
+ "Closed has to be None if not both of start" "and end are defined"
+ )
if start is NaT or end is NaT:
raise ValueError("Neither `start` nor `end` can be NaT")
left_closed, right_closed = dtl.validate_endpoints(closed)
- start, end, _normalized = _maybe_normalize_endpoints(start, end,
- normalize)
+ start, end, _normalized = _maybe_normalize_endpoints(start, end, normalize)
tz = _infer_tz_from_endpoints(start, end, tz)
if tz is not None:
# Localize the start and end arguments
start = _maybe_localize_point(
- start, getattr(start, 'tz', None), start, freq, tz,
- ambiguous, nonexistent
+ start,
+ getattr(start, "tz", None),
+ start,
+ freq,
+ tz,
+ ambiguous,
+ nonexistent,
)
end = _maybe_localize_point(
- end, getattr(end, 'tz', None), end, freq, tz,
- ambiguous, nonexistent
+ end, getattr(end, "tz", None), end, freq, tz, ambiguous, nonexistent
)
if freq is not None:
# We break Day arithmetic (fixed 24 hour) here and opt for
@@ -455,8 +536,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
if tz is not None and index.tz is None:
arr = conversion.tz_localize_to_utc(
- index.asi8,
- tz, ambiguous=ambiguous, nonexistent=nonexistent)
+ index.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent
+ )
index = cls(arr)
@@ -471,12 +552,13 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
# Nanosecond-granularity timestamps aren't always correctly
# representable with doubles, so we limit the range that we
# pass to np.linspace as much as possible
- arr = np.linspace(
- 0, end.value - start.value,
- periods, dtype='int64') + start.value
+ arr = (
+ np.linspace(0, end.value - start.value, periods, dtype="int64")
+ + start.value
+ )
dtype = tz_to_dtype(tz)
index = cls._simple_new(
- arr.astype('M8[ns]', copy=False), freq=None, dtype=dtype
+ arr.astype("M8[ns]", copy=False), freq=None, dtype=dtype
)
if not left_closed and len(index) and index[0] == start:
@@ -504,8 +586,11 @@ def _check_compatible_with(self, other):
if other is NaT:
return
if not timezones.tz_compare(self.tz, other.tz):
- raise ValueError("Timezones don't match. '{own} != {other}'"
- .format(own=self.tz, other=other.tz))
+ raise ValueError(
+ "Timezones don't match. '{own} != {other}'".format(
+ own=self.tz, other=other.tz
+ )
+ )
def _maybe_clear_freq(self):
self._freq = None
@@ -555,8 +640,10 @@ def tz(self):
@tz.setter
def tz(self, value):
# GH 3746: Prevent localizing or converting the index by setting tz
- raise AttributeError("Cannot directly set timezone. Use tz_localize() "
- "or tz_convert() as appropriate")
+ raise AttributeError(
+ "Cannot directly set timezone. Use tz_localize() "
+ "or tz_convert() as appropriate"
+ )
@property
def tzinfo(self):
@@ -610,9 +697,9 @@ def __iter__(self):
for i in range(chunks):
start_i = i * chunksize
end_i = min((i + 1) * chunksize, length)
- converted = tslib.ints_to_pydatetime(data[start_i:end_i],
- tz=self.tz, freq=self.freq,
- box="timestamp")
+ converted = tslib.ints_to_pydatetime(
+ data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp"
+ )
for v in converted:
yield v
@@ -623,11 +710,10 @@ def astype(self, dtype, copy=True):
# DatetimeLikeArrayMixin Super handles the rest.
dtype = pandas_dtype(dtype)
- if (is_datetime64_ns_dtype(dtype) and
- not is_dtype_equal(dtype, self.dtype)):
+ if is_datetime64_ns_dtype(dtype) and not is_dtype_equal(dtype, self.dtype):
# GH#18951: datetime64_ns dtype but not equal means different tz
- new_tz = getattr(dtype, 'tz', None)
- if getattr(self.dtype, 'tz', None) is None:
+ new_tz = getattr(dtype, "tz", None)
+ if getattr(self.dtype, "tz", None) is None:
return self.tz_localize(new_tz)
result = self.tz_convert(new_tz)
if new_tz is None:
@@ -636,8 +722,7 @@ def astype(self, dtype, copy=True):
# ndarray, but we could maybe work around it there.
result = result._data
return result
- elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype,
- dtype):
+ elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype, dtype):
if copy:
return self.copy()
return self
@@ -656,21 +741,23 @@ def _validate_fill_value(self, fill_value):
self._assert_tzawareness_compat(fill_value)
fill_value = Timestamp(fill_value).value
else:
- raise ValueError("'fill_value' should be a Timestamp. "
- "Got '{got}'.".format(got=fill_value))
+ raise ValueError(
+ "'fill_value' should be a Timestamp. "
+ "Got '{got}'.".format(got=fill_value)
+ )
return fill_value
# -----------------------------------------------------------------
# Rendering Methods
- def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
from pandas.io.formats.format import _get_format_datetime64_from_values
+
fmt = _get_format_datetime64_from_values(self, date_format)
- return tslib.format_array_from_datetime(self.asi8,
- tz=self.tz,
- format=fmt,
- na_rep=na_rep)
+ return tslib.format_array_from_datetime(
+ self.asi8, tz=self.tz, format=fmt, na_rep=na_rep
+ )
# -----------------------------------------------------------------
# Comparison Methods
@@ -684,12 +771,12 @@ def _has_same_tz(self, other):
if isinstance(other, np.datetime64):
# convert to Timestamp as np.datetime64 doesn't have tz attr
other = Timestamp(other)
- vzone = timezones.get_timezone(getattr(other, 'tzinfo', '__no_tz__'))
+ vzone = timezones.get_timezone(getattr(other, "tzinfo", "__no_tz__"))
return zzone == vzone
def _assert_tzawareness_compat(self, other):
# adapted from _Timestamp._assert_tzawareness_compat
- other_tz = getattr(other, 'tzinfo', None)
+ other_tz = getattr(other, "tzinfo", None)
if is_datetime64tz_dtype(other):
# Get tzinfo from Series dtype
other_tz = other.dtype.tz
@@ -698,11 +785,13 @@ def _assert_tzawareness_compat(self, other):
pass
elif self.tz is None:
if other_tz is not None:
- raise TypeError('Cannot compare tz-naive and tz-aware '
- 'datetime-like objects.')
+ raise TypeError(
+ "Cannot compare tz-naive and tz-aware " "datetime-like objects."
+ )
elif other_tz is None:
- raise TypeError('Cannot compare tz-naive and tz-aware '
- 'datetime-like objects')
+ raise TypeError(
+ "Cannot compare tz-naive and tz-aware " "datetime-like objects"
+ )
# -----------------------------------------------------------------
# Arithmetic Methods
@@ -718,18 +807,18 @@ def _sub_datetime_arraylike(self, other):
if not self._has_same_tz(other):
# require tz compat
- raise TypeError("{cls} subtraction must have the same "
- "timezones or no timezones"
- .format(cls=type(self).__name__))
+ raise TypeError(
+ "{cls} subtraction must have the same "
+ "timezones or no timezones".format(cls=type(self).__name__)
+ )
self_i8 = self.asi8
other_i8 = other.asi8
arr_mask = self._isnan | other._isnan
- new_values = checked_add_with_arr(self_i8, -other_i8,
- arr_mask=arr_mask)
+ new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask)
if self._hasnans or other._hasnans:
new_values[arr_mask] = iNaT
- return new_values.view('timedelta64[ns]')
+ return new_values.view("timedelta64[ns]")
def _add_offset(self, offset):
assert not isinstance(offset, Tick)
@@ -743,11 +832,13 @@ def _add_offset(self, offset):
result = result.tz_localize(self.tz)
except NotImplementedError:
- warnings.warn("Non-vectorized DateOffset being applied to Series "
- "or DatetimeIndex", PerformanceWarning)
- result = self.astype('O') + offset
+ warnings.warn(
+ "Non-vectorized DateOffset being applied to Series " "or DatetimeIndex",
+ PerformanceWarning,
+ )
+ result = self.astype("O") + offset
- return type(self)._from_sequence(result, freq='infer')
+ return type(self)._from_sequence(result, freq="infer")
def _sub_datetimelike_scalar(self, other):
# subtract a datetime from myself, yielding a ndarray[timedelta64[ns]]
@@ -759,14 +850,14 @@ def _sub_datetimelike_scalar(self, other):
if not self._has_same_tz(other):
# require tz compat
- raise TypeError("Timestamp subtraction must have the same "
- "timezones or no timezones")
+ raise TypeError(
+ "Timestamp subtraction must have the same " "timezones or no timezones"
+ )
i8 = self.asi8
- result = checked_add_with_arr(i8, -other.value,
- arr_mask=self._isnan)
+ result = checked_add_with_arr(i8, -other.value, arr_mask=self._isnan)
result = self._maybe_mask_results(result)
- return result.view('timedelta64[ns]')
+ return result.view("timedelta64[ns]")
def _add_delta(self, delta):
"""
@@ -783,7 +874,7 @@ def _add_delta(self, delta):
result : DatetimeArray
"""
new_values = super()._add_delta(delta)
- return type(self)._from_sequence(new_values, tz=self.tz, freq='infer')
+ return type(self)._from_sequence(new_values, tz=self.tz, freq="infer")
# -----------------------------------------------------------------
# Timezone Conversion and Localization Methods
@@ -865,15 +956,15 @@ def tz_convert(self, tz):
if self.tz is None:
# tz naive, use tz_localize
- raise TypeError('Cannot convert tz-naive timestamps, use '
- 'tz_localize to localize')
+ raise TypeError(
+ "Cannot convert tz-naive timestamps, use " "tz_localize to localize"
+ )
# No conversion since timestamps are all UTC to begin with
dtype = tz_to_dtype(tz)
return self._simple_new(self.asi8, dtype=dtype, freq=self.freq)
- def tz_localize(self, tz, ambiguous='raise', nonexistent='raise',
- errors=None):
+ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None):
"""
Localize tz-naive Datetime Array/Index to tz-aware
Datetime Array/Index.
@@ -1021,30 +1112,35 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise',
dtype: datetime64[ns, 'Europe/Warsaw']
"""
if errors is not None:
- warnings.warn("The errors argument is deprecated and will be "
- "removed in a future release. Use "
- "nonexistent='NaT' or nonexistent='raise' "
- "instead.", FutureWarning)
- if errors == 'coerce':
- nonexistent = 'NaT'
- elif errors == 'raise':
- nonexistent = 'raise'
+ warnings.warn(
+ "The errors argument is deprecated and will be "
+ "removed in a future release. Use "
+ "nonexistent='NaT' or nonexistent='raise' "
+ "instead.",
+ FutureWarning,
+ )
+ if errors == "coerce":
+ nonexistent = "NaT"
+ elif errors == "raise":
+ nonexistent = "raise"
else:
- raise ValueError("The errors argument must be either 'coerce' "
- "or 'raise'.")
+ raise ValueError(
+ "The errors argument must be either 'coerce' " "or 'raise'."
+ )
- nonexistent_options = ('raise', 'NaT', 'shift_forward',
- 'shift_backward')
+ nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
if nonexistent not in nonexistent_options and not isinstance(
- nonexistent, timedelta):
- raise ValueError("The nonexistent argument must be one of 'raise',"
- " 'NaT', 'shift_forward', 'shift_backward' or"
- " a timedelta object")
+ nonexistent, timedelta
+ ):
+ raise ValueError(
+ "The nonexistent argument must be one of 'raise',"
+ " 'NaT', 'shift_forward', 'shift_backward' or"
+ " a timedelta object"
+ )
if self.tz is not None:
if tz is None:
- new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC,
- self.tz)
+ new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz)
else:
raise TypeError("Already tz-aware, use tz_convert to convert.")
else:
@@ -1052,7 +1148,7 @@ def tz_localize(self, tz, ambiguous='raise', nonexistent='raise',
# Convert to UTC
new_dates = conversion.tz_localize_to_utc(
- self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent,
+ self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent
)
new_dates = new_dates.view(_NS_DTYPE)
dtype = tz_to_dtype(tz)
@@ -1114,12 +1210,11 @@ def normalize(self):
not_null = ~self.isna()
DAY_NS = ccalendar.DAY_SECONDS * 1000000000
new_values = self.asi8.copy()
- adjustment = (new_values[not_null] % DAY_NS)
+ adjustment = new_values[not_null] % DAY_NS
new_values[not_null] = new_values[not_null] - adjustment
else:
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
- return type(self)._from_sequence(new_values,
- freq='infer').tz_localize(self.tz)
+ return type(self)._from_sequence(new_values, freq="infer").tz_localize(self.tz)
def to_period(self, freq=None):
"""
@@ -1168,15 +1263,19 @@ def to_period(self, freq=None):
from pandas.core.arrays import PeriodArray
if self.tz is not None:
- warnings.warn("Converting to PeriodArray/Index representation "
- "will drop timezone information.", UserWarning)
+ warnings.warn(
+ "Converting to PeriodArray/Index representation "
+ "will drop timezone information.",
+ UserWarning,
+ )
if freq is None:
freq = self.freqstr or self.inferred_freq
if freq is None:
- raise ValueError("You must pass a freq argument as "
- "current index has none.")
+ raise ValueError(
+ "You must pass a freq argument as " "current index has none."
+ )
freq = get_period_alias(freq)
@@ -1198,8 +1297,9 @@ def to_perioddelta(self, freq):
"""
# TODO: consider privatizing (discussion in GH#23113)
from pandas.core.arrays.timedeltas import TimedeltaArray
+
i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8
- m8delta = i8delta.view('m8[ns]')
+ m8delta = i8delta.view("m8[ns]")
return TimedeltaArray(m8delta)
# -----------------------------------------------------------------
@@ -1236,8 +1336,7 @@ def month_name(self, locale=None):
else:
values = self.asi8
- result = fields.get_date_name_field(values, 'month_name',
- locale=locale)
+ result = fields.get_date_name_field(values, "month_name", locale=locale)
result = self._maybe_mask_results(result, fill_value=None)
return result
@@ -1272,8 +1371,7 @@ def day_name(self, locale=None):
else:
values = self.asi8
- result = fields.get_date_name_field(values, 'day_name',
- locale=locale)
+ result = fields.get_date_name_field(values, "day_name", locale=locale)
result = self._maybe_mask_results(result, fill_value=None)
return result
@@ -1316,19 +1414,17 @@ def date(self):
return tslib.ints_to_pydatetime(timestamps, box="date")
- year = _field_accessor('year', 'Y', "The year of the datetime.")
- month = _field_accessor('month', 'M',
- "The month as January=1, December=12. ")
- day = _field_accessor('day', 'D', "The days of the datetime.")
- hour = _field_accessor('hour', 'h', "The hours of the datetime.")
- minute = _field_accessor('minute', 'm', "The minutes of the datetime.")
- second = _field_accessor('second', 's', "The seconds of the datetime.")
- microsecond = _field_accessor('microsecond', 'us',
- "The microseconds of the datetime.")
- nanosecond = _field_accessor('nanosecond', 'ns',
- "The nanoseconds of the datetime.")
- weekofyear = _field_accessor('weekofyear', 'woy',
- "The week ordinal of the year.")
+ year = _field_accessor("year", "Y", "The year of the datetime.")
+ month = _field_accessor("month", "M", "The month as January=1, December=12. ")
+ day = _field_accessor("day", "D", "The days of the datetime.")
+ hour = _field_accessor("hour", "h", "The hours of the datetime.")
+ minute = _field_accessor("minute", "m", "The minutes of the datetime.")
+ second = _field_accessor("second", "s", "The seconds of the datetime.")
+ microsecond = _field_accessor(
+ "microsecond", "us", "The microseconds of the datetime."
+ )
+ nanosecond = _field_accessor("nanosecond", "ns", "The nanoseconds of the datetime.")
+ weekofyear = _field_accessor("weekofyear", "woy", "The week ordinal of the year.")
week = weekofyear
_dayofweek_doc = """
The day of the week with Monday=0, Sunday=6.
@@ -1364,21 +1460,20 @@ def date(self):
2017-01-08 6
Freq: D, dtype: int64
"""
- dayofweek = _field_accessor('dayofweek', 'dow', _dayofweek_doc)
+ dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc)
weekday = dayofweek
weekday_name = _field_accessor(
- 'weekday_name',
- 'weekday_name',
- "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0")
+ "weekday_name",
+ "weekday_name",
+ "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0",
+ )
- dayofyear = _field_accessor('dayofyear', 'doy',
- "The ordinal day of the year.")
- quarter = _field_accessor('quarter', 'q', "The quarter of the date.")
+ dayofyear = _field_accessor("dayofyear", "doy", "The ordinal day of the year.")
+ quarter = _field_accessor("quarter", "q", "The quarter of the date.")
days_in_month = _field_accessor(
- 'days_in_month',
- 'dim',
- "The number of days in the month.")
+ "days_in_month", "dim", "The number of days in the month."
+ )
daysinmonth = days_in_month
_is_month_doc = """
Indicates whether the date is the {first_or_last} day of the month.
@@ -1425,18 +1520,16 @@ def date(self):
array([False, True, False])
"""
is_month_start = _field_accessor(
- 'is_month_start',
- 'is_month_start',
- _is_month_doc.format(first_or_last='first'))
+ "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first")
+ )
is_month_end = _field_accessor(
- 'is_month_end',
- 'is_month_end',
- _is_month_doc.format(first_or_last='last'))
+ "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last")
+ )
is_quarter_start = _field_accessor(
- 'is_quarter_start',
- 'is_quarter_start',
+ "is_quarter_start",
+ "is_quarter_start",
"""
Indicator for whether the date is the first day of a quarter.
@@ -1474,10 +1567,11 @@ def date(self):
>>> idx.is_quarter_start
array([False, False, True, False])
- """)
+ """,
+ )
is_quarter_end = _field_accessor(
- 'is_quarter_end',
- 'is_quarter_end',
+ "is_quarter_end",
+ "is_quarter_end",
"""
Indicator for whether the date is the last day of a quarter.
@@ -1515,10 +1609,11 @@ def date(self):
>>> idx.is_quarter_end
array([False, True, False, False])
- """)
+ """,
+ )
is_year_start = _field_accessor(
- 'is_year_start',
- 'is_year_start',
+ "is_year_start",
+ "is_year_start",
"""
Indicate whether the date is the first day of a year.
@@ -1558,10 +1653,11 @@ def date(self):
>>> idx.is_year_start
array([False, False, True])
- """)
+ """,
+ )
is_year_end = _field_accessor(
- 'is_year_end',
- 'is_year_end',
+ "is_year_end",
+ "is_year_end",
"""
Indicate whether the date is the last day of the year.
@@ -1601,10 +1697,11 @@ def date(self):
>>> idx.is_year_end
array([False, True, False])
- """)
+ """,
+ )
is_leap_year = _field_accessor(
- 'is_leap_year',
- 'is_leap_year',
+ "is_leap_year",
+ "is_leap_year",
"""
Boolean indicator if the date belongs to a leap year.
@@ -1641,7 +1738,8 @@ def date(self):
1 False
2 False
dtype: bool
- """)
+ """,
+ )
def to_julian_date(self):
"""
@@ -1657,19 +1755,23 @@ def to_julian_date(self):
testarr = month < 3
year[testarr] -= 1
month[testarr] += 12
- return (day +
- np.fix((153 * month - 457) / 5) +
- 365 * year +
- np.floor(year / 4) -
- np.floor(year / 100) +
- np.floor(year / 400) +
- 1721118.5 +
- (self.hour +
- self.minute / 60.0 +
- self.second / 3600.0 +
- self.microsecond / 3600.0 / 1e+6 +
- self.nanosecond / 3600.0 / 1e+9
- ) / 24.0)
+ return (
+ day
+ + np.fix((153 * month - 457) / 5)
+ + 365 * year
+ + np.floor(year / 4)
+ - np.floor(year / 100)
+ + np.floor(year / 400)
+ + 1721118.5
+ + (
+ self.hour
+ + self.minute / 60.0
+ + self.second / 3600.0
+ + self.microsecond / 3600.0 / 1e6
+ + self.nanosecond / 3600.0 / 1e9
+ )
+ / 24.0
+ )
DatetimeArray._add_comparison_ops()
@@ -1678,10 +1780,17 @@ def to_julian_date(self):
# -------------------------------------------------------------------
# Constructor Helpers
-def sequence_to_dt64ns(data, dtype=None, copy=False,
- tz=None,
- dayfirst=False, yearfirst=False, ambiguous='raise',
- int_as_wall_time=False):
+
+def sequence_to_dt64ns(
+ data,
+ dtype=None,
+ copy=False,
+ tz=None,
+ dayfirst=False,
+ yearfirst=False,
+ ambiguous="raise",
+ int_as_wall_time=False,
+):
"""
Parameters
----------
@@ -1748,13 +1857,14 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
# TODO: We do not have tests specific to string-dtypes,
# also complex or categorical or other extension
copy = False
- if lib.infer_dtype(data, skipna=False) == 'integer':
+ if lib.infer_dtype(data, skipna=False) == "integer":
data = data.astype(np.int64)
else:
# data comes back here as either i8 to denote UTC timestamps
# or M8[ns] to denote wall times
data, inferred_tz = objects_to_datetime64ns(
- data, dayfirst=dayfirst, yearfirst=yearfirst)
+ data, dayfirst=dayfirst, yearfirst=yearfirst
+ )
tz = maybe_infer_tz(tz, inferred_tz)
# When a sequence of timestamp objects is passed, we always
# want to treat the (now i8-valued) data as UTC timestamps,
@@ -1777,8 +1887,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
if tz is not None:
# Convert tz-naive to UTC
tz = timezones.maybe_get_tz(tz)
- data = conversion.tz_localize_to_utc(data.view('i8'), tz,
- ambiguous=ambiguous)
+ data = conversion.tz_localize_to_utc(
+ data.view("i8"), tz, ambiguous=ambiguous
+ )
data = data.view(_NS_DTYPE)
assert data.dtype == _NS_DTYPE, data.dtype
@@ -1794,8 +1905,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
data = data.astype(np.int64, copy=False)
if int_as_wall_time and tz is not None and not timezones.is_utc(tz):
warnings.warn(_i8_message, FutureWarning, stacklevel=4)
- data = conversion.tz_localize_to_utc(data.view('i8'), tz,
- ambiguous=ambiguous)
+ data = conversion.tz_localize_to_utc(
+ data.view("i8"), tz, ambiguous=ambiguous
+ )
data = data.view(_NS_DTYPE)
result = data.view(_NS_DTYPE)
@@ -1804,7 +1916,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
result = result.copy()
assert isinstance(result, np.ndarray), type(result)
- assert result.dtype == 'M8[ns]', result.dtype
+ assert result.dtype == "M8[ns]", result.dtype
# We have to call this again after possibly inferring a tz above
validate_tz_from_dtype(dtype, tz)
@@ -1812,9 +1924,15 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
return result, tz, inferred_freq
-def objects_to_datetime64ns(data, dayfirst, yearfirst,
- utc=False, errors="raise",
- require_iso8601=False, allow_object=False):
+def objects_to_datetime64ns(
+ data,
+ dayfirst,
+ yearfirst,
+ utc=False,
+ errors="raise",
+ require_iso8601=False,
+ allow_object=False,
+):
"""
Convert data to array of timestamps.
@@ -1854,14 +1972,14 @@ def objects_to_datetime64ns(data, dayfirst, yearfirst,
utc=utc,
dayfirst=dayfirst,
yearfirst=yearfirst,
- require_iso8601=require_iso8601
+ require_iso8601=require_iso8601,
)
except ValueError as e:
try:
values, tz_parsed = conversion.datetime_to_datetime64(data)
# If tzaware, these values represent unix timestamps, so we
# return them as i8 to distinguish from wall times
- return values.view('i8'), tz_parsed
+ return values.view("i8"), tz_parsed
except (ValueError, TypeError):
raise e
@@ -1869,7 +1987,7 @@ def objects_to_datetime64ns(data, dayfirst, yearfirst,
# We can take a shortcut since the datetime64 numpy array
# is in UTC
# Return i8 values to denote unix timestamps
- return result.view('i8'), tz_parsed
+ return result.view("i8"), tz_parsed
elif is_datetime64_dtype(result):
# returning M8[ns] denotes wall-times; since tz is None
# the distinction is a thin one
@@ -1917,16 +2035,20 @@ def maybe_convert_dtype(data, copy):
# with integer dtypes. See discussion in GH#23675
elif is_timedelta64_dtype(data):
- warnings.warn("Passing timedelta64-dtype data is deprecated, will "
- "raise a TypeError in a future version",
- FutureWarning, stacklevel=5)
+ warnings.warn(
+ "Passing timedelta64-dtype data is deprecated, will "
+ "raise a TypeError in a future version",
+ FutureWarning,
+ stacklevel=5,
+ )
data = data.view(_NS_DTYPE)
elif is_period_dtype(data):
# Note: without explicitly raising here, PeriodIndex
# test_setops.test_join_does_not_recur fails
- raise TypeError("Passing PeriodDtype data is invalid. "
- "Use `data.to_timestamp()` instead")
+ raise TypeError(
+ "Passing PeriodDtype data is invalid. " "Use `data.to_timestamp()` instead"
+ )
elif is_categorical_dtype(data):
# GH#18664 preserve tz in going DTI->Categorical->DTI
@@ -1947,6 +2069,7 @@ def maybe_convert_dtype(data, copy):
# -------------------------------------------------------------------
# Validation and Inference
+
def maybe_infer_tz(tz, inferred_tz):
"""
If a timezone is inferred from data, check that it is compatible with
@@ -1970,9 +2093,10 @@ def maybe_infer_tz(tz, inferred_tz):
elif inferred_tz is None:
pass
elif not timezones.tz_compare(tz, inferred_tz):
- raise TypeError('data is already tz-aware {inferred_tz}, unable to '
- 'set specified tz: {tz}'
- .format(inferred_tz=inferred_tz, tz=tz))
+ raise TypeError(
+ "data is already tz-aware {inferred_tz}, unable to "
+ "set specified tz: {tz}".format(inferred_tz=inferred_tz, tz=tz)
+ )
return tz
@@ -2003,17 +2127,21 @@ def _validate_dt64_dtype(dtype):
if is_dtype_equal(dtype, np.dtype("M8")):
# no precision, warn
dtype = _NS_DTYPE
- msg = textwrap.dedent("""\
+ msg = textwrap.dedent(
+ """\
Passing in 'datetime64' dtype with no precision is deprecated
and will raise in a future version. Please pass in
- 'datetime64[ns]' instead.""")
+ 'datetime64[ns]' instead."""
+ )
warnings.warn(msg, FutureWarning, stacklevel=5)
- if ((isinstance(dtype, np.dtype) and dtype != _NS_DTYPE)
- or not isinstance(dtype, (np.dtype, DatetimeTZDtype))):
- raise ValueError("Unexpected value for 'dtype': '{dtype}'. "
- "Must be 'datetime64[ns]' or DatetimeTZDtype'."
- .format(dtype=dtype))
+ if (isinstance(dtype, np.dtype) and dtype != _NS_DTYPE) or not isinstance(
+ dtype, (np.dtype, DatetimeTZDtype)
+ ):
+ raise ValueError(
+ "Unexpected value for 'dtype': '{dtype}'. "
+ "Must be 'datetime64[ns]' or DatetimeTZDtype'.".format(dtype=dtype)
+ )
return dtype
@@ -2046,19 +2174,20 @@ def validate_tz_from_dtype(dtype, tz):
# but not by us. We *do* allow non-existent tz errors to
# go through
pass
- dtz = getattr(dtype, 'tz', None)
+ dtz = getattr(dtype, "tz", None)
if dtz is not None:
if tz is not None and not timezones.tz_compare(tz, dtz):
- raise ValueError("cannot supply both a tz and a dtype"
- " with a tz")
+ raise ValueError("cannot supply both a tz and a dtype" " with a tz")
tz = dtz
if tz is not None and is_datetime64_dtype(dtype):
# We also need to check for the case where the user passed a
# tz-naive dtype (i.e. datetime64[ns])
if tz is not None and not timezones.tz_compare(tz, dtz):
- raise ValueError("cannot supply both a tz and a "
- "timezone-naive dtype (i.e. datetime64[ns])")
+ raise ValueError(
+ "cannot supply both a tz and a "
+ "timezone-naive dtype (i.e. datetime64[ns])"
+ )
return tz
@@ -2086,16 +2215,16 @@ def _infer_tz_from_endpoints(start, end, tz):
try:
inferred_tz = timezones.infer_tzinfo(start, end)
except Exception:
- raise TypeError('Start and end cannot both be tz-aware with '
- 'different timezones')
+ raise TypeError(
+ "Start and end cannot both be tz-aware with " "different timezones"
+ )
inferred_tz = timezones.maybe_get_tz(inferred_tz)
tz = timezones.maybe_get_tz(tz)
if tz is not None and inferred_tz is not None:
if not timezones.tz_compare(inferred_tz, tz):
- raise AssertionError("Inferred time zone not equal to passed "
- "time zone")
+ raise AssertionError("Inferred time zone not equal to passed " "time zone")
elif inferred_tz is not None:
tz = inferred_tz
@@ -2123,8 +2252,7 @@ def _maybe_normalize_endpoints(start, end, normalize):
return start, end, _normalized
-def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous,
- nonexistent):
+def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent):
"""
Localize a start or end Timestamp to the timezone of the corresponding
start or end Timestamp
@@ -2149,10 +2277,9 @@ def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous,
if is_none is None and is_not_none is not None:
# Note: We can't ambiguous='infer' a singular ambiguous time; however,
# we have historically defaulted ambiguous=False
- ambiguous = ambiguous if ambiguous != 'infer' else False
- localize_args = {'ambiguous': ambiguous, 'nonexistent': nonexistent,
- 'tz': None}
+ ambiguous = ambiguous if ambiguous != "infer" else False
+ localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None}
if isinstance(freq, Tick) or freq is None:
- localize_args['tz'] = tz
+ localize_args["tz"] = tz
ts = ts.tz_localize(**localize_args)
return ts
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 644c2f634240f..c999c4db232e6 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -12,8 +12,15 @@
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
- is_bool_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype,
- is_list_like, is_object_dtype, is_scalar)
+ is_bool_dtype,
+ is_float,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_object_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna, notna
@@ -32,23 +39,23 @@ class _IntegerDtype(ExtensionDtype):
The attributes name & type are set when these subclasses are created.
"""
+
name = None # type: str
base = None
type = None # type: Type
na_value = np.nan
def __repr__(self):
- sign = 'U' if self.is_unsigned_integer else ''
- return "{sign}Int{size}Dtype()".format(sign=sign,
- size=8 * self.itemsize)
+ sign = "U" if self.is_unsigned_integer else ""
+ return "{sign}Int{size}Dtype()".format(sign=sign, size=8 * self.itemsize)
@cache_readonly
def is_signed_integer(self):
- return self.kind == 'i'
+ return self.kind == "i"
@cache_readonly
def is_unsigned_integer(self):
- return self.kind == 'u'
+ return self.kind == "u"
@property
def _is_numeric(self):
@@ -111,15 +118,18 @@ def safe_cast(values, dtype, copy):
"""
try:
- return values.astype(dtype, casting='safe', copy=copy)
+ return values.astype(dtype, casting="safe", copy=copy)
except TypeError:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted
- raise TypeError("cannot safely cast non-equivalent {} to {}".format(
- values.dtype, np.dtype(dtype)))
+ raise TypeError(
+ "cannot safely cast non-equivalent {} to {}".format(
+ values.dtype, np.dtype(dtype)
+ )
+ )
def coerce_to_array(values, dtype, mask=None, copy=False):
@@ -139,13 +149,14 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
tuple of (values, mask)
"""
    # if values is an integer numpy array, preserve its dtype
- if dtype is None and hasattr(values, 'dtype'):
+ if dtype is None and hasattr(values, "dtype"):
if is_integer_dtype(values.dtype):
dtype = values.dtype
if dtype is not None:
- if (isinstance(dtype, str) and
- (dtype.startswith("Int") or dtype.startswith("UInt"))):
+ if isinstance(dtype, str) and (
+ dtype.startswith("Int") or dtype.startswith("UInt")
+ ):
# Avoid DeprecationWarning from NumPy about np.dtype("Int64")
# https://github.com/numpy/numpy/pull/7476
dtype = dtype.lower()
@@ -169,20 +180,26 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
- if inferred_type == 'empty':
+ if inferred_type == "empty":
values = np.empty(len(values))
values.fill(np.nan)
- elif inferred_type not in ['floating', 'integer',
- 'mixed-integer', 'mixed-integer-float']:
- raise TypeError("{} cannot be converted to an IntegerDtype".format(
- values.dtype))
+ elif inferred_type not in [
+ "floating",
+ "integer",
+ "mixed-integer",
+ "mixed-integer-float",
+ ]:
+ raise TypeError(
+ "{} cannot be converted to an IntegerDtype".format(values.dtype)
+ )
elif is_bool_dtype(values) and is_integer_dtype(dtype):
values = np.array(values, dtype=int, copy=copy)
elif not (is_integer_dtype(values) or is_float_dtype(values)):
- raise TypeError("{} cannot be converted to an IntegerDtype".format(
- values.dtype))
+ raise TypeError(
+ "{} cannot be converted to an IntegerDtype".format(values.dtype)
+ )
if mask is None:
mask = isna(values)
@@ -196,7 +213,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
# infer dtype if needed
if dtype is None:
- dtype = np.dtype('int64')
+ dtype = np.dtype("int64")
else:
dtype = dtype.type
@@ -284,13 +301,16 @@ def dtype(self):
return _dtypes[str(self._data.dtype)]
def __init__(self, values, mask, copy=False):
- if not (isinstance(values, np.ndarray)
- and is_integer_dtype(values.dtype)):
- raise TypeError("values should be integer numpy array. Use "
- "the 'integer_array' function instead")
+ if not (isinstance(values, np.ndarray) and is_integer_dtype(values.dtype)):
+ raise TypeError(
+ "values should be integer numpy array. Use "
+ "the 'integer_array' function instead"
+ )
if not (isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)):
- raise TypeError("mask should be boolean numpy array. Use "
- "the 'integer_array' function instead")
+ raise TypeError(
+ "mask should be boolean numpy array. Use "
+ "the 'integer_array' function instead"
+ )
if copy:
values = values.copy()
@@ -315,8 +335,9 @@ def _from_factorized(cls, values, original):
def _formatter(self, boxed=False):
def fmt(x):
if isna(x):
- return 'NaN'
+ return "NaN"
return str(x)
+
return fmt
def __getitem__(self, item):
@@ -350,10 +371,10 @@ def __array__(self, dtype=None):
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# For IntegerArray inputs, we apply the ufunc to ._data
# and mask the result.
- if method == 'reduce':
+ if method == "reduce":
# Not clear how to handle missing values in reductions. Raise.
raise NotImplementedError("The 'reduce' method is not supported.")
- out = kwargs.get('out', ())
+ out = kwargs.get("out", ())
for x in inputs + out:
if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)):
@@ -361,7 +382,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs)
+ self, ufunc, method, *inputs, **kwargs
+ )
if result is not NotImplemented:
return result
@@ -404,11 +426,11 @@ def take(self, indexer, allow_fill=False, fill_value=None):
# we always fill with 1 internally
# to avoid upcasting
data_fill_value = 1 if isna(fill_value) else fill_value
- result = take(self._data, indexer, fill_value=data_fill_value,
- allow_fill=allow_fill)
+ result = take(
+ self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
+ )
- mask = take(self._mask, indexer, fill_value=True,
- allow_fill=allow_fill)
+ mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
# if we are filling
# we only fill where the indexer is null
@@ -545,9 +567,10 @@ def value_counts(self, dropna=True):
# appending to an Index *always* infers
# w/o passing the dtype
array = np.append(array, [self._mask.sum()])
- index = Index(np.concatenate(
- [index.values,
- np.array([np.nan], dtype=object)]), dtype=object)
+ index = Index(
+ np.concatenate([index.values, np.array([np.nan], dtype=object)]),
+ dtype=object,
+ )
return Series(array, index=index)
@@ -585,7 +608,7 @@ def cmp_method(self, other):
elif is_list_like(other):
other = np.asarray(other)
if other.ndim > 0 and len(self) != len(other):
- raise ValueError('Lengths must match to compare')
+ raise ValueError("Lengths must match to compare")
other = lib.item_from_zerodim(other)
@@ -593,7 +616,7 @@ def cmp_method(self, other):
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(self._data, other)
# nans propagate
@@ -602,10 +625,10 @@ def cmp_method(self, other):
else:
mask = self._mask | mask
- result[mask] = op_name == 'ne'
+ result[mask] = op_name == "ne"
return result
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
return set_function_name(cmp_method, name, cls)
def _reduce(self, name, skipna=True, **kwargs):
@@ -614,19 +637,19 @@ def _reduce(self, name, skipna=True, **kwargs):
# coerce to a nan-aware float if needed
if mask.any():
- data = self._data.astype('float64')
+ data = self._data.astype("float64")
data[mask] = self._na_value
- op = getattr(nanops, 'nan' + name)
+ op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask)
# if we have a boolean op, don't coerce
- if name in ['any', 'all']:
+ if name in ["any", "all"]:
pass
# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
- elif name in ['sum', 'min', 'max', 'prod'] and notna(result):
+ elif name in ["sum", "min", "max", "prod"] and notna(result):
int_result = int(result)
if int_result == result:
result = int_result
@@ -651,8 +674,9 @@ def _maybe_mask_result(self, result, mask, other, op_name):
# if we have a float operand we are by-definition
# a float result
# or our op is a divide
- if ((is_float_dtype(other) or is_float(other)) or
- (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])):
+ if (is_float_dtype(other) or is_float(other)) or (
+ op_name in ["rtruediv", "truediv", "rdiv", "div"]
+ ):
result[mask] = np.nan
return result
@@ -669,14 +693,13 @@ def integer_arithmetic_method(self, other):
# Rely on pandas to unbox and dispatch to us.
return NotImplemented
- if getattr(other, 'ndim', 0) > 1:
- raise NotImplementedError(
- "can only perform ops with 1-d structures")
+ if getattr(other, "ndim", 0) > 1:
+ raise NotImplementedError("can only perform ops with 1-d structures")
if isinstance(other, IntegerArray):
other, mask = other._data, other._mask
- elif getattr(other, 'ndim', None) == 0:
+ elif getattr(other, "ndim", None) == 0:
other = other.item()
elif is_list_like(other):
@@ -685,8 +708,7 @@ def integer_arithmetic_method(self, other):
other = other.item()
elif other.ndim == 1:
if not (is_float_dtype(other) or is_integer_dtype(other)):
- raise TypeError(
- "can only perform ops with numeric values")
+ raise TypeError("can only perform ops with numeric values")
else:
if not (is_float(other) or is_integer(other)):
raise TypeError("can only perform ops with numeric values")
@@ -698,24 +720,26 @@ def integer_arithmetic_method(self, other):
mask = self._mask | mask
# 1 ** np.nan is 1. So we have to unmask those.
- if op_name == 'pow':
+ if op_name == "pow":
mask = np.where(self == 1, False, mask)
- elif op_name == 'rpow':
+ elif op_name == "rpow":
mask = np.where(other == 1, False, mask)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(self._data, other)
# divmod returns a tuple
- if op_name == 'divmod':
+ if op_name == "divmod":
div, mod = result
- return (self._maybe_mask_result(div, mask, other, 'floordiv'),
- self._maybe_mask_result(mod, mask, other, 'mod'))
+ return (
+ self._maybe_mask_result(div, mask, other, "floordiv"),
+ self._maybe_mask_result(mod, mask, other, "mod"),
+ )
return self._maybe_mask_result(result, mask, other, op_name)
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
return set_function_name(integer_arithmetic_method, name, cls)
@@ -739,76 +763,108 @@ def integer_arithmetic_method(self, other):
# create the Dtype
Int8Dtype = register_extension_dtype(
- type('Int8Dtype', (_IntegerDtype, ), {
- 'type': np.int8,
- 'name': 'Int8',
- '__doc__': _dtype_docstring.format(dtype='int8')
- })
+ type(
+ "Int8Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.int8,
+ "name": "Int8",
+ "__doc__": _dtype_docstring.format(dtype="int8"),
+ },
+ )
)
Int16Dtype = register_extension_dtype(
- type('Int16Dtype', (_IntegerDtype, ), {
- 'type': np.int16,
- 'name': 'Int16',
- '__doc__': _dtype_docstring.format(dtype='int16')
- })
+ type(
+ "Int16Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.int16,
+ "name": "Int16",
+ "__doc__": _dtype_docstring.format(dtype="int16"),
+ },
+ )
)
Int32Dtype = register_extension_dtype(
- type('Int32Dtype', (_IntegerDtype, ), {
- 'type': np.int32,
- 'name': 'Int32',
- '__doc__': _dtype_docstring.format(dtype='int32')
- })
+ type(
+ "Int32Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.int32,
+ "name": "Int32",
+ "__doc__": _dtype_docstring.format(dtype="int32"),
+ },
+ )
)
Int64Dtype = register_extension_dtype(
- type('Int64Dtype', (_IntegerDtype, ), {
- 'type': np.int64,
- 'name': 'Int64',
- '__doc__': _dtype_docstring.format(dtype='int64')
- })
+ type(
+ "Int64Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.int64,
+ "name": "Int64",
+ "__doc__": _dtype_docstring.format(dtype="int64"),
+ },
+ )
)
UInt8Dtype = register_extension_dtype(
- type('UInt8Dtype', (_IntegerDtype, ), {
- 'type': np.uint8,
- 'name': 'UInt8',
- '__doc__': _dtype_docstring.format(dtype='uint8')
- })
+ type(
+ "UInt8Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.uint8,
+ "name": "UInt8",
+ "__doc__": _dtype_docstring.format(dtype="uint8"),
+ },
+ )
)
UInt16Dtype = register_extension_dtype(
- type('UInt16Dtype', (_IntegerDtype, ), {
- 'type': np.uint16,
- 'name': 'UInt16',
- '__doc__': _dtype_docstring.format(dtype='uint16')
- })
+ type(
+ "UInt16Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.uint16,
+ "name": "UInt16",
+ "__doc__": _dtype_docstring.format(dtype="uint16"),
+ },
+ )
)
UInt32Dtype = register_extension_dtype(
- type('UInt32Dtype', (_IntegerDtype, ), {
- 'type': np.uint32,
- 'name': 'UInt32',
- '__doc__': _dtype_docstring.format(dtype='uint32')
- })
+ type(
+ "UInt32Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.uint32,
+ "name": "UInt32",
+ "__doc__": _dtype_docstring.format(dtype="uint32"),
+ },
+ )
)
UInt64Dtype = register_extension_dtype(
- type('UInt64Dtype', (_IntegerDtype, ), {
- 'type': np.uint64,
- 'name': 'UInt64',
- '__doc__': _dtype_docstring.format(dtype='uint64')
- })
+ type(
+ "UInt64Dtype",
+ (_IntegerDtype,),
+ {
+ "type": np.uint64,
+ "name": "UInt64",
+ "__doc__": _dtype_docstring.format(dtype="uint64"),
+ },
+ )
)
_dtypes = {
- 'int8': Int8Dtype(),
- 'int16': Int16Dtype(),
- 'int32': Int32Dtype(),
- 'int64': Int64Dtype(),
- 'uint8': UInt8Dtype(),
- 'uint16': UInt16Dtype(),
- 'uint32': UInt32Dtype(),
- 'uint64': UInt64Dtype(),
+ "int8": Int8Dtype(),
+ "int16": Int16Dtype(),
+ "int32": Int32Dtype(),
+ "int64": Int64Dtype(),
+ "uint8": UInt8Dtype(),
+ "uint16": UInt16Dtype(),
+ "uint32": UInt32Dtype(),
+ "uint64": UInt64Dtype(),
}
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index cf8ca25857f4e..f9fbd7ada376e 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -5,38 +5,49 @@
from pandas._config import get_option
-from pandas._libs.interval import (
- Interval, IntervalMixin, intervals_to_interval_bounds)
+from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds
from pandas.compat.numpy import function as nv
from pandas.util._decorators import Appender
from pandas.core.dtypes.cast import maybe_convert_platform
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_datetime64_any_dtype, is_float_dtype,
- is_integer_dtype, is_interval, is_interval_dtype, is_scalar,
- is_string_dtype, is_timedelta64_dtype, pandas_dtype)
+ is_categorical_dtype,
+ is_datetime64_any_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ is_interval,
+ is_interval_dtype,
+ is_scalar,
+ is_string_dtype,
+ is_timedelta64_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas.core.dtypes.generic import (
- ABCDatetimeIndex, ABCInterval, ABCIntervalIndex, ABCPeriodIndex, ABCSeries)
+ ABCDatetimeIndex,
+ ABCInterval,
+ ABCIntervalIndex,
+ ABCPeriodIndex,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import isna, notna
-from pandas.core.arrays.base import (
- ExtensionArray, _extension_array_shared_docs)
+from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs
from pandas.core.arrays.categorical import Categorical
import pandas.core.common as com
from pandas.core.indexes.base import Index, ensure_index
-_VALID_CLOSED = {'left', 'right', 'both', 'neither'}
+_VALID_CLOSED = {"left", "right", "both", "neither"}
_interval_shared_docs = {}
_shared_docs_kwargs = dict(
- klass='IntervalArray',
- qualname='arrays.IntervalArray',
- name=''
+ klass="IntervalArray", qualname="arrays.IntervalArray", name=""
)
-_interval_shared_docs['class'] = """
+_interval_shared_docs[
+ "class"
+] = """
%(summary)s
.. versionadded:: %(versionadded)s
@@ -99,14 +110,17 @@
"""
-@Appender(_interval_shared_docs['class'] % dict(
- klass="IntervalArray",
- summary="Pandas array for interval data that are closed on the same side.",
- versionadded="0.24.0",
- name='',
- extra_attributes='',
- extra_methods='',
- examples=textwrap.dedent("""\
+@Appender(
+ _interval_shared_docs["class"]
+ % dict(
+ klass="IntervalArray",
+ summary="Pandas array for interval data that are closed on the same side.",
+ versionadded="0.24.0",
+ name="",
+ extra_attributes="",
+ extra_methods="",
+ examples=textwrap.dedent(
+ """\
Examples
--------
A new ``IntervalArray`` can be constructed directly from an array-like of
@@ -120,16 +134,17 @@
It may also be constructed using one of the constructor
methods: :meth:`IntervalArray.from_arrays`,
:meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`.
- """),
-))
+ """
+ ),
+ )
+)
class IntervalArray(IntervalMixin, ExtensionArray):
dtype = IntervalDtype()
ndim = 1
can_hold_na = True
_na_value = _fill_value = np.nan
- def __new__(cls, data, closed=None, dtype=None, copy=False,
- verify_integrity=True):
+ def __new__(cls, data, closed=None, dtype=None, copy=False, verify_integrity=True):
if isinstance(data, ABCSeries) and is_interval_dtype(data):
data = data.values
@@ -142,25 +157,35 @@ def __new__(cls, data, closed=None, dtype=None, copy=False,
# don't allow scalars
if is_scalar(data):
- msg = ("{}(...) must be called with a collection of some kind,"
- " {} was passed")
+ msg = (
+ "{}(...) must be called with a collection of some kind,"
+ " {} was passed"
+ )
raise TypeError(msg.format(cls.__name__, data))
# might need to convert empty or purely na data
data = maybe_convert_platform_interval(data)
left, right, infer_closed = intervals_to_interval_bounds(
- data, validate_closed=closed is None)
+ data, validate_closed=closed is None
+ )
closed = closed or infer_closed
- return cls._simple_new(left, right, closed, copy=copy, dtype=dtype,
- verify_integrity=verify_integrity)
+ return cls._simple_new(
+ left,
+ right,
+ closed,
+ copy=copy,
+ dtype=dtype,
+ verify_integrity=verify_integrity,
+ )
@classmethod
- def _simple_new(cls, left, right, closed=None,
- copy=False, dtype=None, verify_integrity=True):
+ def _simple_new(
+ cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True
+ ):
result = IntervalMixin.__new__(cls)
- closed = closed or 'right'
+ closed = closed or "right"
left = ensure_index(left, copy=copy)
right = ensure_index(right, copy=copy)
@@ -168,7 +193,7 @@ def _simple_new(cls, left, right, closed=None,
# GH 19262: dtype must be an IntervalDtype to override inferred
dtype = pandas_dtype(dtype)
if not is_interval_dtype(dtype):
- msg = 'dtype must be an IntervalDtype, got {dtype}'
+ msg = "dtype must be an IntervalDtype, got {dtype}"
raise TypeError(msg.format(dtype=dtype))
elif dtype.subtype is not None:
left = left.astype(dtype.subtype)
@@ -181,22 +206,25 @@ def _simple_new(cls, left, right, closed=None,
left = left.astype(right.dtype)
if type(left) != type(right):
- msg = ('must not have differing left [{ltype}] and right '
- '[{rtype}] types')
- raise ValueError(msg.format(ltype=type(left).__name__,
- rtype=type(right).__name__))
+ msg = "must not have differing left [{ltype}] and right " "[{rtype}] types"
+ raise ValueError(
+ msg.format(ltype=type(left).__name__, rtype=type(right).__name__)
+ )
elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
# GH 19016
- msg = ('category, object, and string subtypes are not supported '
- 'for IntervalArray')
+ msg = (
+ "category, object, and string subtypes are not supported "
+ "for IntervalArray"
+ )
raise TypeError(msg)
elif isinstance(left, ABCPeriodIndex):
- msg = 'Period dtypes are not supported, use a PeriodIndex instead'
+ msg = "Period dtypes are not supported, use a PeriodIndex instead"
raise ValueError(msg)
- elif (isinstance(left, ABCDatetimeIndex) and
- str(left.tz) != str(right.tz)):
- msg = ("left and right must have the same time zone, got "
- "'{left_tz}' and '{right_tz}'")
+ elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz):
+ msg = (
+ "left and right must have the same time zone, got "
+ "'{left_tz}' and '{right_tz}'"
+ )
raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz))
result._left = left
@@ -219,7 +247,9 @@ def _from_factorized(cls, values, original):
values = values.astype(original.dtype.subtype)
return cls(values, closed=original.closed)
- _interval_shared_docs['from_breaks'] = """
+ _interval_shared_docs[
+ "from_breaks"
+ ] = """
Construct an %(klass)s from an array of splits.
Parameters
@@ -255,14 +285,15 @@ def _from_factorized(cls, values, original):
"""
@classmethod
- @Appender(_interval_shared_docs['from_breaks'] % _shared_docs_kwargs)
- def from_breaks(cls, breaks, closed='right', copy=False, dtype=None):
+ @Appender(_interval_shared_docs["from_breaks"] % _shared_docs_kwargs)
+ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
breaks = maybe_convert_platform_interval(breaks)
- return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy,
- dtype=dtype)
+ return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype)
- _interval_shared_docs['from_arrays'] = """
+ _interval_shared_docs[
+ "from_arrays"
+ ] = """
Construct from two arrays defining the left and right bounds.
Parameters
@@ -317,15 +348,18 @@ def from_breaks(cls, breaks, closed='right', copy=False, dtype=None):
"""
@classmethod
- @Appender(_interval_shared_docs['from_arrays'] % _shared_docs_kwargs)
- def from_arrays(cls, left, right, closed='right', copy=False, dtype=None):
+ @Appender(_interval_shared_docs["from_arrays"] % _shared_docs_kwargs)
+ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
left = maybe_convert_platform_interval(left)
right = maybe_convert_platform_interval(right)
- return cls._simple_new(left, right, closed, copy=copy,
- dtype=dtype, verify_integrity=True)
+ return cls._simple_new(
+ left, right, closed, copy=copy, dtype=dtype, verify_integrity=True
+ )
- _interval_shared_docs['from_intervals'] = """
+ _interval_shared_docs[
+ "from_intervals"
+ ] = """
Construct an %(klass)s from a 1d array of Interval objects
.. deprecated:: 0.23.0
@@ -367,7 +401,9 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None):
closed='right', dtype='interval[int64]')
"""
- _interval_shared_docs['from_tuples'] = """
+ _interval_shared_docs[
+ "from_tuples"
+ ] = """
Construct an %(klass)s from an array-like of tuples
Parameters
@@ -404,8 +440,8 @@ def from_arrays(cls, left, right, closed='right', copy=False, dtype=None):
"""
@classmethod
- @Appender(_interval_shared_docs['from_tuples'] % _shared_docs_kwargs)
- def from_tuples(cls, data, closed='right', copy=False, dtype=None):
+ @Appender(_interval_shared_docs["from_tuples"] % _shared_docs_kwargs)
+ def from_tuples(cls, data, closed="right", copy=False, dtype=None):
if len(data):
left, right = [], []
else:
@@ -421,18 +457,19 @@ def from_tuples(cls, data, closed='right', copy=False, dtype=None):
# need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...]
lhs, rhs = d
except ValueError:
- msg = ('{name}.from_tuples requires tuples of '
- 'length 2, got {tpl}').format(name=name, tpl=d)
+ msg = (
+ "{name}.from_tuples requires tuples of " "length 2, got {tpl}"
+ ).format(name=name, tpl=d)
raise ValueError(msg)
except TypeError:
- msg = ('{name}.from_tuples received an invalid '
- 'item, {tpl}').format(name=name, tpl=d)
+ msg = (
+ "{name}.from_tuples received an invalid " "item, {tpl}"
+ ).format(name=name, tpl=d)
raise TypeError(msg)
left.append(lhs)
right.append(rhs)
- return cls.from_arrays(left, right, closed, copy=False,
- dtype=dtype)
+ return cls.from_arrays(left, right, closed, copy=False, dtype=dtype)
def _validate(self):
"""Verify that the IntervalArray is valid.
@@ -445,17 +482,20 @@ def _validate(self):
* left is always below right
"""
if self.closed not in _VALID_CLOSED:
- raise ValueError("invalid option for 'closed': {closed}"
- .format(closed=self.closed))
+ raise ValueError(
+ "invalid option for 'closed': {closed}".format(closed=self.closed)
+ )
if len(self.left) != len(self.right):
- raise ValueError('left and right must have the same length')
+ raise ValueError("left and right must have the same length")
left_mask = notna(self.left)
right_mask = notna(self.right)
if not (left_mask == right_mask).all():
- raise ValueError('missing values must be missing in the same '
- 'location both left and right sides')
+ raise ValueError(
+ "missing values must be missing in the same "
+ "location both left and right sides"
+ )
if not (self.left[left_mask] <= self.right[left_mask]).all():
- raise ValueError('left side of interval must be <= right side')
+ raise ValueError("left side of interval must be <= right side")
# ---------
# Interface
@@ -487,10 +527,10 @@ def __setitem__(self, key, value):
needs_float_conversion = True
elif is_datetime64_any_dtype(self.dtype.subtype):
# need proper NaT to set directly on the numpy array
- value = np.datetime64('NaT')
+ value = np.datetime64("NaT")
elif is_timedelta64_dtype(self.dtype.subtype):
# need proper NaT to set directly on the numpy array
- value = np.timedelta64('NaT')
+ value = np.timedelta64("NaT")
value_left, value_right = value, value
# scalar interval
@@ -512,13 +552,13 @@ def __setitem__(self, key, value):
# forced to copy, update the copy, and swap in the new values.
left = self.left.copy(deep=True)
if needs_float_conversion:
- left = left.astype('float')
+ left = left.astype("float")
left.values[key] = value_left
self._left = left
right = self.right.copy(deep=True)
if needs_float_conversion:
- right = right.astype('float')
+ right = right.astype("float")
right.values[key] = value_right
self._right = right
@@ -550,18 +590,20 @@ def fillna(self, value=None, method=None, limit=None):
filled : IntervalArray with NA/NaN filled
"""
if method is not None:
- raise TypeError('Filling by method is not supported for '
- 'IntervalArray.')
+ raise TypeError("Filling by method is not supported for " "IntervalArray.")
if limit is not None:
- raise TypeError('limit is not supported for IntervalArray.')
+ raise TypeError("limit is not supported for IntervalArray.")
if not isinstance(value, ABCInterval):
- msg = ("'IntervalArray.fillna' only supports filling with a "
- "scalar 'pandas.Interval'. Got a '{}' instead."
- .format(type(value).__name__))
+ msg = (
+ "'IntervalArray.fillna' only supports filling with a "
+ "scalar 'pandas.Interval'. Got a '{}' instead.".format(
+ type(value).__name__
+ )
+ )
raise TypeError(msg)
- value = getattr(value, '_values', value)
+ value = getattr(value, "_values", value)
self._check_closed_matches(value, name="value")
left = self.left.fillna(value=value.left)
@@ -601,8 +643,10 @@ def astype(self, dtype, copy=True):
new_left = self.left.astype(dtype.subtype)
new_right = self.right.astype(dtype.subtype)
except TypeError:
- msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are '
- 'incompatible')
+ msg = (
+ "Cannot convert {dtype} to {new_dtype}; subtypes are "
+ "incompatible"
+ )
raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
return self._shallow_copy(new_left, new_right)
elif is_categorical_dtype(dtype):
@@ -611,7 +655,7 @@ def astype(self, dtype, copy=True):
try:
return np.asarray(self).astype(dtype, copy=copy)
except (TypeError, ValueError):
- msg = 'Cannot cast {name} to dtype {dtype}'
+ msg = "Cannot cast {name} to dtype {dtype}"
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
@classmethod
@@ -674,8 +718,7 @@ def _shallow_copy(self, left=None, right=None, closed=None):
pass
closed = closed or self.closed
- return self._simple_new(
- left, right, closed=closed, verify_integrity=False)
+ return self._simple_new(left, right, closed=closed, verify_integrity=False)
def copy(self):
"""
@@ -707,8 +750,7 @@ def size(self):
def shape(self):
return self.left.shape
- def take(self, indices, allow_fill=False, fill_value=None, axis=None,
- **kwargs):
+ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs):
"""
Take elements from the IntervalArray.
@@ -763,18 +805,23 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None,
if fill_value is None:
fill_left = fill_right = self.left._na_value
elif is_interval(fill_value):
- self._check_closed_matches(fill_value, name='fill_value')
+ self._check_closed_matches(fill_value, name="fill_value")
fill_left, fill_right = fill_value.left, fill_value.right
elif not is_scalar(fill_value) and notna(fill_value):
- msg = ("'IntervalArray.fillna' only supports filling with a "
- "'scalar pandas.Interval or NA'. Got a '{}' instead."
- .format(type(fill_value).__name__))
+ msg = (
+ "'IntervalArray.fillna' only supports filling with a "
+ "'scalar pandas.Interval or NA'. Got a '{}' instead.".format(
+ type(fill_value).__name__
+ )
+ )
raise ValueError(msg)
- left_take = take(self.left, indices,
- allow_fill=allow_fill, fill_value=fill_left)
- right_take = take(self.right, indices,
- allow_fill=allow_fill, fill_value=fill_right)
+ left_take = take(
+ self.left, indices, allow_fill=allow_fill, fill_value=fill_left
+ )
+ right_take = take(
+ self.right, indices, allow_fill=allow_fill, fill_value=fill_right
+ )
return self._shallow_copy(left_take, right_take)
@@ -797,6 +844,7 @@ def value_counts(self, dropna=True):
"""
# TODO: implement this is a non-naive way!
from pandas.core.algorithms import value_counts
+
return value_counts(np.asarray(self), dropna=dropna)
# Formatting
@@ -806,46 +854,51 @@ def _format_data(self):
# TODO: integrate with categorical and make generic
# name argument is unused here; just for compat with base / categorical
n = len(self)
- max_seq_items = min((get_option(
- 'display.max_seq_items') or n) // 10, 10)
+ max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10)
formatter = str
if n == 0:
- summary = '[]'
+ summary = "[]"
elif n == 1:
first = formatter(self[0])
- summary = '[{first}]'.format(first=first)
+ summary = "[{first}]".format(first=first)
elif n == 2:
first = formatter(self[0])
last = formatter(self[-1])
- summary = '[{first}, {last}]'.format(first=first, last=last)
+ summary = "[{first}, {last}]".format(first=first, last=last)
else:
if n > max_seq_items:
n = min(max_seq_items // 2, 10)
head = [formatter(x) for x in self[:n]]
tail = [formatter(x) for x in self[-n:]]
- summary = '[{head} ... {tail}]'.format(
- head=', '.join(head), tail=', '.join(tail))
+ summary = "[{head} ... {tail}]".format(
+ head=", ".join(head), tail=", ".join(tail)
+ )
else:
tail = [formatter(x) for x in self]
- summary = '[{tail}]'.format(tail=', '.join(tail))
+ summary = "[{tail}]".format(tail=", ".join(tail))
return summary
def __repr__(self):
- tpl = textwrap.dedent("""\
+ tpl = textwrap.dedent(
+ """\
{cls}({data},
{lead}closed='{closed}',
- {lead}dtype='{dtype}')""")
- return tpl.format(cls=self.__class__.__name__,
- data=self._format_data(),
- lead=' ' * len(self.__class__.__name__) + ' ',
- closed=self.closed, dtype=self.dtype)
+ {lead}dtype='{dtype}')"""
+ )
+ return tpl.format(
+ cls=self.__class__.__name__,
+ data=self._format_data(),
+ lead=" " * len(self.__class__.__name__) + " ",
+ closed=self.closed,
+ dtype=self.dtype,
+ )
def _format_space(self):
- space = ' ' * (len(self.__class__.__name__) + 1)
+ space = " " * (len(self.__class__.__name__) + 1)
return "\n{space}".format(space=space)
@property
@@ -872,7 +925,9 @@ def closed(self):
"""
return self._closed
- _interval_shared_docs['set_closed'] = """
+ _interval_shared_docs[
+ "set_closed"
+ ] = """
Return an %(klass)s identical to the current one, but closed on the
specified side
@@ -901,7 +956,7 @@ def closed(self):
dtype='interval[int64]')
"""
- @Appender(_interval_shared_docs['set_closed'] % _shared_docs_kwargs)
+ @Appender(_interval_shared_docs["set_closed"] % _shared_docs_kwargs)
def set_closed(self, closed):
if closed not in _VALID_CLOSED:
msg = "invalid option for 'closed': {closed}"
@@ -919,8 +974,10 @@ def length(self):
return self.right - self.left
except TypeError:
# length not defined for some types, e.g. string
- msg = ('IntervalArray contains Intervals without defined length, '
- 'e.g. Intervals with string endpoints')
+ msg = (
+ "IntervalArray contains Intervals without defined length, "
+ "e.g. Intervals with string endpoints"
+ )
raise TypeError(msg)
@property
@@ -934,7 +991,9 @@ def mid(self):
# datetime safe version
return self.left + 0.5 * self.length
- _interval_shared_docs['is_non_overlapping_monotonic'] = """
+ _interval_shared_docs[
+ "is_non_overlapping_monotonic"
+ ] = """
Return True if the %(klass)s is non-overlapping (no Intervals share
points) and is either monotonic increasing or monotonic decreasing,
else False
@@ -942,8 +1001,9 @@ def mid(self):
# https://github.com/python/mypy/issues/1362
# Mypy does not support decorated properties
@property # type: ignore
- @Appender(_interval_shared_docs['is_non_overlapping_monotonic']
- % _shared_docs_kwargs)
+ @Appender(
+ _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs
+ )
def is_non_overlapping_monotonic(self):
# must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... )
# or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...)
@@ -951,14 +1011,18 @@ def is_non_overlapping_monotonic(self):
# strict inequality for closed == 'both'; equality implies overlapping
# at a point when both sides of intervals are included
- if self.closed == 'both':
- return bool((self.right[:-1] < self.left[1:]).all() or
- (self.left[:-1] > self.right[1:]).all())
+ if self.closed == "both":
+ return bool(
+ (self.right[:-1] < self.left[1:]).all()
+ or (self.left[:-1] > self.right[1:]).all()
+ )
# non-strict inequality when closed != 'both'; at least one side is
# not included in the intervals, so equality does not imply overlapping
- return bool((self.right[:-1] <= self.left[1:]).all() or
- (self.left[:-1] >= self.right[1:]).all())
+ return bool(
+ (self.right[:-1] <= self.left[1:]).all()
+ or (self.left[:-1] >= self.right[1:]).all()
+ )
# Conversion
def __array__(self, dtype=None):
@@ -979,7 +1043,9 @@ def __array__(self, dtype=None):
result[i] = Interval(left[i], right[i], closed)
return result
- _interval_shared_docs['to_tuples'] = """
+ _interval_shared_docs[
+ "to_tuples"
+ ] = """
Return an %(return_type)s of tuples of the form (left, right)
Parameters
@@ -996,10 +1062,9 @@ def __array__(self, dtype=None):
%(examples)s\
"""
- @Appender(_interval_shared_docs['to_tuples'] % dict(
- return_type='ndarray',
- examples='',
- ))
+ @Appender(
+ _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="")
+ )
def to_tuples(self, na_tuple=True):
tuples = com.asarray_tuplesafe(zip(self.left, self.right))
if not na_tuple:
@@ -1007,14 +1072,16 @@ def to_tuples(self, na_tuple=True):
tuples = np.where(~self.isna(), tuples, np.nan)
return tuples
- @Appender(_extension_array_shared_docs['repeat'] % _shared_docs_kwargs)
+ @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs)
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
left_repeat = self.left.repeat(repeats)
right_repeat = self.right.repeat(repeats)
return self._shallow_copy(left=left_repeat, right=right_repeat)
- _interval_shared_docs['contains'] = """
+ _interval_shared_docs[
+ "contains"
+ ] = """
Check elementwise if the Intervals contain the value.
Return a boolean mask whether the value is contained in the Intervals
@@ -1048,19 +1115,18 @@ def repeat(self, repeats, axis=None):
array([ True, False, False])
"""
- @Appender(_interval_shared_docs['contains'] % _shared_docs_kwargs)
+ @Appender(_interval_shared_docs["contains"] % _shared_docs_kwargs)
def contains(self, other):
if isinstance(other, Interval):
- raise NotImplementedError(
- 'contains not implemented for two intervals'
- )
+ raise NotImplementedError("contains not implemented for two intervals")
- return (
- (self.left < other if self.open_left else self.left <= other) &
- (other < self.right if self.open_right else other <= self.right)
+ return (self.left < other if self.open_left else self.left <= other) & (
+ other < self.right if self.open_right else other <= self.right
)
- _interval_shared_docs['overlaps'] = """
+ _interval_shared_docs[
+ "overlaps"
+ ] = """
Check elementwise if an Interval overlaps the values in the %(klass)s.
Two intervals overlap if they share a common point, including closed
@@ -1104,12 +1170,12 @@ def contains(self, other):
array([False, True, False])
"""
- @Appender(_interval_shared_docs['overlaps'] % _shared_docs_kwargs)
+ @Appender(_interval_shared_docs["overlaps"] % _shared_docs_kwargs)
def overlaps(self, other):
if isinstance(other, (IntervalArray, ABCIntervalIndex)):
raise NotImplementedError
elif not isinstance(other, Interval):
- msg = '`other` must be Interval-like, got {other}'
+ msg = "`other` must be Interval-like, got {other}"
raise TypeError(msg.format(other=type(other).__name__))
# equality is okay if both endpoints are closed (overlap at a point)
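The hunks above reformat the IntervalArray alternate constructors (from_breaks, from_arrays, from_tuples) and the contains/overlaps predicates without changing behaviour. A minimal usage sketch, not taken from this patch and assuming a pandas build that exposes these classmethods on pd.arrays.IntervalArray:

import pandas as pd

arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3], closed="right")
# the same intervals built from explicit bounds or from 2-tuples
pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3])
pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (2, 3)])

arr.contains(1.5)                    # elementwise: array([False,  True, False])
arr.overlaps(pd.Interval(0.5, 1.5))  # array([ True,  True, False])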
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
index 1c5dc7666c3a1..9f428a4ac10b2 100644
--- a/pandas/core/arrays/numpy_.py
+++ b/pandas/core/arrays/numpy_.py
@@ -33,7 +33,8 @@ class PandasDtype(ExtensionDtype):
----------
dtype : numpy.dtype
"""
- _metadata = ('_dtype',)
+
+ _metadata = ("_dtype",)
def __init__(self, dtype):
dtype = np.dtype(dtype)
@@ -60,11 +61,11 @@ def type(self):
@property
def _is_numeric(self):
# exclude object, str, unicode, void.
- return self.kind in set('biufc')
+ return self.kind in set("biufc")
@property
def _is_boolean(self):
- return self.kind == 'b'
+ return self.kind == "b"
@classmethod
def construct_from_string(cls, string):
@@ -107,6 +108,7 @@ class PandasArray(ExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin):
-------
None
"""
+
# If you're wondering why pd.Series(cls) doesn't put the array in an
# ExtensionBlock, search for `ABCPandasArray`. We check for
# that _typ to ensure that that users don't unnecessarily use EAs inside
@@ -171,7 +173,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# numpy.lib.mixins.NDArrayOperatorsMixin.html
# The primary modification is not boxing scalar return values
# in PandasArray, since pandas' ExtensionArrays are 1-d.
- out = kwargs.get('out', ())
+ out = kwargs.get("out", ())
for x in inputs + out:
# Only support operations with instances of _HANDLED_TYPES.
# Use PandasArray instead of type(self) for isinstance to
@@ -181,12 +183,11 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
return NotImplemented
# Defer to the implementation of the ufunc on unwrapped values.
- inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x
- for x in inputs)
+ inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs)
if out:
- kwargs['out'] = tuple(
- x._ndarray if isinstance(x, PandasArray) else x
- for x in out)
+ kwargs["out"] = tuple(
+ x._ndarray if isinstance(x, PandasArray) else x for x in out
+ )
result = getattr(ufunc, method)(*inputs, **kwargs)
if type(result) is tuple and len(result):
@@ -197,7 +198,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
else:
# but not scalar reductions
return result
- elif method == 'at':
+ elif method == "at":
# no return value
return None
else:
@@ -233,7 +234,7 @@ def __setitem__(self, key, value):
values = self._ndarray
t = np.result_type(value, values)
if t != self._ndarray.dtype:
- values = values.astype(t, casting='safe')
+ values = values.astype(t, casting="safe")
values[key] = value
self._dtype = PandasDtype(t)
self._ndarray = values
@@ -260,15 +261,16 @@ def fillna(self, value=None, method=None, limit=None):
if is_array_like(value):
if len(value) != len(self):
- raise ValueError("Length of 'value' does not match. Got ({}) "
- " expected {}".format(len(value), len(self)))
+ raise ValueError(
+ "Length of 'value' does not match. Got ({}) "
+ " expected {}".format(len(value), len(self))
+ )
value = value[mask]
if mask.any():
if method is not None:
- func = pad_1d if method == 'pad' else backfill_1d
- new_values = func(self._ndarray, limit=limit,
- mask=mask)
+ func = pad_1d if method == "pad" else backfill_1d
+ new_values = func(self._ndarray, limit=limit, mask=mask)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
@@ -281,8 +283,9 @@ def fillna(self, value=None, method=None, limit=None):
def take(self, indices, allow_fill=False, fill_value=None):
from pandas.core.algorithms import take
- result = take(self._ndarray, indices, allow_fill=allow_fill,
- fill_value=fill_value)
+ result = take(
+ self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
+ )
return type(self)(result)
def copy(self):
@@ -307,9 +310,7 @@ def _reduce(self, name, skipna=True, **kwargs):
if meth:
return meth(skipna=skipna, **kwargs)
else:
- msg = (
- "'{}' does not implement reduction '{}'"
- )
+ msg = "'{}' does not implement reduction '{}'"
raise TypeError(msg.format(type(self).__name__, name))
def any(self, axis=None, out=None, keepdims=False, skipna=True):
@@ -328,67 +329,80 @@ def max(self, axis=None, out=None, keepdims=False, skipna=True):
nv.validate_max((), dict(out=out, keepdims=keepdims))
return nanops.nanmax(self._ndarray, axis=axis, skipna=skipna)
- def sum(self, axis=None, dtype=None, out=None, keepdims=False,
- initial=None, skipna=True, min_count=0):
- nv.validate_sum((), dict(dtype=dtype, out=out, keepdims=keepdims,
- initial=initial))
- return nanops.nansum(self._ndarray, axis=axis, skipna=skipna,
- min_count=min_count)
-
- def prod(self, axis=None, dtype=None, out=None, keepdims=False,
- initial=None, skipna=True, min_count=0):
- nv.validate_prod((), dict(dtype=dtype, out=out, keepdims=keepdims,
- initial=initial))
- return nanops.nanprod(self._ndarray, axis=axis, skipna=skipna,
- min_count=min_count)
-
- def mean(self, axis=None, dtype=None, out=None, keepdims=False,
- skipna=True):
+ def sum(
+ self,
+ axis=None,
+ dtype=None,
+ out=None,
+ keepdims=False,
+ initial=None,
+ skipna=True,
+ min_count=0,
+ ):
+ nv.validate_sum(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
+ )
+ return nanops.nansum(
+ self._ndarray, axis=axis, skipna=skipna, min_count=min_count
+ )
+
+ def prod(
+ self,
+ axis=None,
+ dtype=None,
+ out=None,
+ keepdims=False,
+ initial=None,
+ skipna=True,
+ min_count=0,
+ ):
+ nv.validate_prod(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial)
+ )
+ return nanops.nanprod(
+ self._ndarray, axis=axis, skipna=skipna, min_count=min_count
+ )
+
+ def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims))
return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
- def median(self, axis=None, out=None, overwrite_input=False,
- keepdims=False, skipna=True):
- nv.validate_median((), dict(out=out, overwrite_input=overwrite_input,
- keepdims=keepdims))
+ def median(
+ self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True
+ ):
+ nv.validate_median(
+ (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims)
+ )
return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
- def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False,
- skipna=True):
- nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
- keepdims=keepdims),
- fname='std')
- return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna,
- ddof=ddof)
-
- def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False,
- skipna=True):
- nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
- keepdims=keepdims),
- fname='var')
- return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna,
- ddof=ddof)
-
- def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False,
- skipna=True):
- nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
- keepdims=keepdims),
- fname='sem')
- return nanops.nansem(self._ndarray, axis=axis, skipna=skipna,
- ddof=ddof)
-
- def kurt(self, axis=None, dtype=None, out=None, keepdims=False,
- skipna=True):
- nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
- keepdims=keepdims),
- fname='kurt')
+ def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True):
+ nv.validate_stat_ddof_func(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std"
+ )
+ return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
+
+ def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True):
+ nv.validate_stat_ddof_func(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var"
+ )
+ return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
+
+ def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True):
+ nv.validate_stat_ddof_func(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem"
+ )
+ return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
+
+ def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
+ nv.validate_stat_ddof_func(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt"
+ )
return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
- def skew(self, axis=None, dtype=None, out=None, keepdims=False,
- skipna=True):
- nv.validate_stat_ddof_func((), dict(dtype=dtype, out=out,
- keepdims=keepdims),
- fname='skew')
+ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True):
+ nv.validate_stat_ddof_func(
+ (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew"
+ )
return nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
# ------------------------------------------------------------------------
@@ -417,9 +431,8 @@ def to_numpy(self, dtype=None, copy=False):
return result
@Appender(ExtensionArray.searchsorted.__doc__)
- def searchsorted(self, value, side='left', sorter=None):
- return searchsorted(self.to_numpy(), value,
- side=side, sorter=sorter)
+ def searchsorted(self, value, side="left", sorter=None):
+ return searchsorted(self.to_numpy(), value, side=side, sorter=sorter)
# ------------------------------------------------------------------------
# Ops
@@ -445,9 +458,9 @@ def arithmetic_method(self, other):
return cls(result)
- return compat.set_function_name(arithmetic_method,
- "__{}__".format(op.__name__),
- cls)
+ return compat.set_function_name(
+ arithmetic_method, "__{}__".format(op.__name__), cls
+ )
_create_comparison_method = _create_arithmetic_method
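Several messages reformatted in this file and elsewhere in the patch end up as two adjacent string literals on a single line (for example the length check in PandasArray.fillna above). A small sketch of why that is harmless: adjacent literals are concatenated at compile time, so the runtime message is unchanged.

# the two adjacent literals are equivalent to one literal; the doubled
# space comes from the original wording, not from the reformatting
msg = "Length of 'value' does not match. Got ({}) " " expected {}"
assert msg == "Length of 'value' does not match. Got ({})  expected {}"
print(msg.format(3, 5))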
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index bb144764a26fc..8291cb70affcd 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -6,21 +6,41 @@
from pandas._libs import lib
from pandas._libs.tslibs import (
- NaT, NaTType, frequencies as libfrequencies, iNaT, period as libperiod)
+ NaT,
+ NaTType,
+ frequencies as libfrequencies,
+ iNaT,
+ period as libperiod,
+)
from pandas._libs.tslibs.fields import isleapyear_arr
from pandas._libs.tslibs.period import (
- DIFFERENT_FREQ, IncompatibleFrequency, Period, get_period_field_arr,
- period_asfreq_arr)
+ DIFFERENT_FREQ,
+ IncompatibleFrequency,
+ Period,
+ get_period_field_arr,
+ period_asfreq_arr,
+)
from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds
import pandas.compat as compat
from pandas.util._decorators import Appender, cache_readonly
from pandas.core.dtypes.common import (
- _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype,
- is_list_like, is_period_dtype, pandas_dtype)
+ _TD_DTYPE,
+ ensure_object,
+ is_datetime64_dtype,
+ is_float_dtype,
+ is_list_like,
+ is_period_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import PeriodDtype
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries)
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCPeriodArray,
+ ABCPeriodIndex,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import isna, notna
import pandas.core.algorithms as algos
@@ -46,8 +66,8 @@ def _period_array_cmp(cls, op):
"""
Wrap comparison operations to convert Period-like to PeriodDtype
"""
- opname = '__{name}__'.format(name=op.__name__)
- nat_result = opname == '__ne__'
+ opname = "__{name}__".format(name=op.__name__)
+ nat_result = opname == "__ne__"
def wrapper(self, other):
op = getattr(self.asi8, opname)
@@ -138,6 +158,7 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps):
The `freq` indicates the span covered by each element of the array.
All elements in the PeriodArray have the same `freq`.
"""
+
# array priority higher than numpy scalars
__array_priority__ = 1000
_attributes = ["freq"]
@@ -146,14 +167,27 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps):
# Names others delegate to us
_other_ops = [] # type: List[str]
- _bool_ops = ['is_leap_year']
- _object_ops = ['start_time', 'end_time', 'freq']
- _field_ops = ['year', 'month', 'day', 'hour', 'minute', 'second',
- 'weekofyear', 'weekday', 'week', 'dayofweek',
- 'dayofyear', 'quarter', 'qyear',
- 'days_in_month', 'daysinmonth']
+ _bool_ops = ["is_leap_year"]
+ _object_ops = ["start_time", "end_time", "freq"]
+ _field_ops = [
+ "year",
+ "month",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "weekofyear",
+ "weekday",
+ "week",
+ "dayofweek",
+ "dayofyear",
+ "quarter",
+ "qyear",
+ "days_in_month",
+ "daysinmonth",
+ ]
_datetimelike_ops = _field_ops + _object_ops + _bool_ops
- _datetimelike_methods = ['strftime', 'to_timestamp', 'asfreq']
+ _datetimelike_methods = ["strftime", "to_timestamp", "asfreq"]
# --------------------------------------------------------------------
# Constructors
@@ -174,16 +208,18 @@ def __init__(self, values, freq=None, dtype=None, copy=False):
if isinstance(values, type(self)):
if freq is not None and freq != values.freq:
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=values.freq.freqstr,
- other_freq=freq.freqstr)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__,
+ own_freq=values.freq.freqstr,
+ other_freq=freq.freqstr,
+ )
raise IncompatibleFrequency(msg)
values, freq = values._data, values.freq
- values = np.array(values, dtype='int64', copy=copy)
+ values = np.array(values, dtype="int64", copy=copy)
self._data = values
if freq is None:
- raise ValueError('freq is not specified and cannot be inferred')
+ raise ValueError("freq is not specified and cannot be inferred")
self._dtype = PeriodDtype(freq)
@classmethod
@@ -193,10 +229,10 @@ def _simple_new(cls, values, freq=None, **kwargs):
@classmethod
def _from_sequence(
- cls,
- scalars: Sequence[Optional[Period]],
- dtype: Optional[PeriodDtype] = None,
- copy: bool = False,
+ cls,
+ scalars: Sequence[Optional[Period]],
+ dtype: Optional[PeriodDtype] = None,
+ copy: bool = False,
) -> ABCPeriodArray:
if dtype:
freq = dtype.freq
@@ -245,14 +281,14 @@ def _generate_range(cls, start, end, periods, freq, fields):
field_count = len(fields)
if start is not None or end is not None:
if field_count > 0:
- raise ValueError('Can either instantiate from fields '
- 'or endpoints, but not both')
+ raise ValueError(
+ "Can either instantiate from fields " "or endpoints, but not both"
+ )
subarr, freq = _get_ordinal_range(start, end, periods, freq)
elif field_count > 0:
subarr, freq = _range_from_fields(freq=freq, **fields)
else:
- raise ValueError('Not enough parameters to construct '
- 'Period range')
+ raise ValueError("Not enough parameters to construct " "Period range")
return subarr, freq
@@ -267,8 +303,9 @@ def _unbox_scalar(self, value: Union[Period, NaTType]) -> int:
self._check_compatible_with(value)
return value.ordinal
else:
- raise ValueError("'value' should be a Period. Got '{val}' instead."
- .format(val=value))
+ raise ValueError(
+ "'value' should be a Period. Got '{val}' instead.".format(val=value)
+ )
def _scalar_from_string(self, value: str) -> Period:
return Period(value, freq=self.freq)
@@ -301,23 +338,26 @@ def __array__(self, dtype=None):
# --------------------------------------------------------------------
# Vectorized analogues of Period properties
- year = _field_accessor('year', 0, "The year of the period")
- month = _field_accessor('month', 3, "The month as January=1, December=12")
- day = _field_accessor('day', 4, "The days of the period")
- hour = _field_accessor('hour', 5, "The hour of the period")
- minute = _field_accessor('minute', 6, "The minute of the period")
- second = _field_accessor('second', 7, "The second of the period")
- weekofyear = _field_accessor('week', 8, "The week ordinal of the year")
+ year = _field_accessor("year", 0, "The year of the period")
+ month = _field_accessor("month", 3, "The month as January=1, December=12")
+ day = _field_accessor("day", 4, "The days of the period")
+ hour = _field_accessor("hour", 5, "The hour of the period")
+ minute = _field_accessor("minute", 6, "The minute of the period")
+ second = _field_accessor("second", 7, "The second of the period")
+ weekofyear = _field_accessor("week", 8, "The week ordinal of the year")
week = weekofyear
- dayofweek = _field_accessor('dayofweek', 10,
- "The day of the week with Monday=0, Sunday=6")
+ dayofweek = _field_accessor(
+ "dayofweek", 10, "The day of the week with Monday=0, Sunday=6"
+ )
weekday = dayofweek
- dayofyear = day_of_year = _field_accessor('dayofyear', 9,
- "The ordinal day of the year")
- quarter = _field_accessor('quarter', 2, "The quarter of the date")
- qyear = _field_accessor('qyear', 1)
- days_in_month = _field_accessor('days_in_month', 11,
- "The number of days in the month")
+ dayofyear = day_of_year = _field_accessor(
+ "dayofyear", 9, "The ordinal day of the year"
+ )
+ quarter = _field_accessor("quarter", 2, "The quarter of the date")
+ qyear = _field_accessor("qyear", 1)
+ days_in_month = _field_accessor(
+ "days_in_month", 11, "The number of days in the month"
+ )
daysinmonth = days_in_month
@property
@@ -329,13 +369,13 @@ def is_leap_year(self):
@property
def start_time(self):
- return self.to_timestamp(how='start')
+ return self.to_timestamp(how="start")
@property
def end_time(self):
- return self.to_timestamp(how='end')
+ return self.to_timestamp(how="end")
- def to_timestamp(self, freq=None, how='start'):
+ def to_timestamp(self, freq=None, how="start"):
"""
Cast to DatetimeArray/Index.
@@ -354,15 +394,15 @@ def to_timestamp(self, freq=None, how='start'):
how = libperiod._validate_end_alias(how)
- end = how == 'E'
+ end = how == "E"
if end:
- if freq == 'B':
+ if freq == "B":
# roll forward to ensure we land on B date
- adjust = Timedelta(1, 'D') - Timedelta(1, 'ns')
- return self.to_timestamp(how='start') + adjust
+ adjust = Timedelta(1, "D") - Timedelta(1, "ns")
+ return self.to_timestamp(how="start") + adjust
else:
- adjust = Timedelta(1, 'ns')
- return (self + self.freq).to_timestamp(how='start') - adjust
+ adjust = Timedelta(1, "ns")
+ return (self + self.freq).to_timestamp(how="start") - adjust
if freq is None:
base, mult = libfrequencies.get_freq_code(self.freq)
@@ -374,7 +414,7 @@ def to_timestamp(self, freq=None, how='start'):
new_data = self.asfreq(freq, how=how)
new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base)
- return DatetimeArray._from_sequence(new_data, freq='infer')
+ return DatetimeArray._from_sequence(new_data, freq="infer")
# --------------------------------------------------------------------
# Array-like / EA-Interface Methods
@@ -392,8 +432,10 @@ def _validate_fill_value(self, fill_value):
self._check_compatible_with(fill_value)
fill_value = fill_value.ordinal
else:
- raise ValueError("'fill_value' should be a Period. "
- "Got '{got}'.".format(got=fill_value))
+ raise ValueError(
+ "'fill_value' should be a Period. "
+ "Got '{got}'.".format(got=fill_value)
+ )
return fill_value
# --------------------------------------------------------------------
@@ -414,9 +456,10 @@ def _time_shift(self, periods, freq=None):
Frequency increment to shift by.
"""
if freq is not None:
- raise TypeError("`freq` argument is not supported for "
- "{cls}._time_shift"
- .format(cls=type(self).__name__))
+ raise TypeError(
+ "`freq` argument is not supported for "
+ "{cls}._time_shift".format(cls=type(self).__name__)
+ )
values = self.asi8 + periods * self.freq.n
if self._hasnans:
values[self._isnan] = iNaT
@@ -426,7 +469,7 @@ def _time_shift(self, periods, freq=None):
def _box_func(self):
return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq)
- def asfreq(self, freq=None, how='E'):
+ def asfreq(self, freq=None, how="E"):
"""
Convert the Period Array/Index to the specified frequency `freq`.
@@ -469,7 +512,7 @@ def asfreq(self, freq=None, how='E'):
asi8 = self.asi8
# mult1 can't be negative or 0
- end = how == 'E'
+ end = how == "E"
if end:
ordinal = asi8 + mult1 - 1
else:
@@ -485,7 +528,7 @@ def asfreq(self, freq=None, how='E'):
# ------------------------------------------------------------------
# Rendering Methods
- def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
"""
actually format my specific types
"""
@@ -494,14 +537,13 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
if date_format:
formatter = lambda dt: dt.strftime(date_format)
else:
- formatter = lambda dt: '%s' % dt
+ formatter = lambda dt: "%s" % dt
if self._hasnans:
mask = self._isnan
values[mask] = na_rep
imask = ~mask
- values[imask] = np.array([formatter(dt) for dt
- in values[imask]])
+ values[imask] = np.array([formatter(dt) for dt in values[imask]])
else:
values = np.array([formatter(dt) for dt in values])
return values
@@ -548,17 +590,15 @@ def _sub_period(self, other):
@Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__)
def _addsub_int_array(
- self,
- other: Union[ABCPeriodArray, ABCSeries,
- ABCPeriodIndex, np.ndarray],
- op: Callable[[Any], Any]
+ self,
+ other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray],
+ op: Callable[[Any], Any],
) -> ABCPeriodArray:
assert op in [operator.add, operator.sub]
if op is operator.sub:
other = -other
- res_values = algos.checked_add_with_arr(self.asi8, other,
- arr_mask=self._isnan)
- res_values = res_values.view('i8')
+ res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan)
+ res_values = res_values.view("i8")
res_values[self._isnan] = iNaT
return type(self)(res_values, freq=self.freq)
@@ -663,12 +703,12 @@ def _check_timedeltalike_freq_compat(self, other):
elif isinstance(other, np.ndarray):
# numpy timedelta64 array; all entries must be compatible
- assert other.dtype.kind == 'm'
+ assert other.dtype.kind == "m"
if other.dtype != _TD_DTYPE:
# i.e. non-nano unit
# TODO: disallow unit-less timedelta64
other = other.astype(_TD_DTYPE)
- nanos = other.view('i8')
+ nanos = other.view("i8")
else:
# TimedeltaArray/Index
nanos = other.asi8
@@ -712,19 +752,18 @@ def _raise_on_incompatible(left, right):
else:
other_freq = _delta_to_tick(Timedelta(right)).freqstr
- msg = DIFFERENT_FREQ.format(cls=type(left).__name__,
- own_freq=left.freqstr,
- other_freq=other_freq)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq
+ )
raise IncompatibleFrequency(msg)
# -------------------------------------------------------------------
# Constructor Helpers
+
def period_array(
- data: Sequence[Optional[Period]],
- freq: Optional[Tick] = None,
- copy: bool = False,
+ data: Sequence[Optional[Period]], freq: Optional[Tick] = None, copy: bool = False
) -> PeriodArray:
"""
Construct a new PeriodArray from a sequence of Period scalars.
@@ -796,8 +835,7 @@ def period_array(
dtype = None
if is_float_dtype(data) and len(data) > 0:
- raise TypeError("PeriodIndex does not allow "
- "floating point in construction")
+ raise TypeError("PeriodIndex does not allow " "floating point in construction")
data = ensure_object(data)
@@ -829,12 +867,11 @@ def validate_dtype_freq(dtype, freq):
if dtype is not None:
dtype = pandas_dtype(dtype)
if not is_period_dtype(dtype):
- raise ValueError('dtype must be PeriodDtype')
+ raise ValueError("dtype must be PeriodDtype")
if freq is None:
freq = dtype.freq
elif freq != dtype.freq:
- raise IncompatibleFrequency('specified freq and dtype '
- 'are different')
+ raise IncompatibleFrequency("specified freq and dtype " "are different")
return freq
@@ -858,8 +895,8 @@ def dt64arr_to_periodarr(data, freq, tz=None):
used.
"""
- if data.dtype != np.dtype('M8[ns]'):
- raise ValueError('Wrong dtype: {dtype}'.format(dtype=data.dtype))
+ if data.dtype != np.dtype("M8[ns]"):
+ raise ValueError("Wrong dtype: {dtype}".format(dtype=data.dtype))
if freq is None:
if isinstance(data, ABCIndexClass):
@@ -873,13 +910,15 @@ def dt64arr_to_periodarr(data, freq, tz=None):
data = data._values
base, mult = libfrequencies.get_freq_code(freq)
- return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz), freq
+ return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq
def _get_ordinal_range(start, end, periods, freq, mult=1):
if com.count_not_none(start, end, periods) != 2:
- raise ValueError('Of the three parameters: start, end, and periods, '
- 'exactly two must be specified')
+ raise ValueError(
+ "Of the three parameters: start, end, and periods, "
+ "exactly two must be specified"
+ )
if freq is not None:
_, mult = libfrequencies.get_freq_code(freq)
@@ -893,9 +932,9 @@ def _get_ordinal_range(start, end, periods, freq, mult=1):
is_end_per = isinstance(end, Period)
if is_start_per and is_end_per and start.freq != end.freq:
- raise ValueError('start and end must have same freq')
- if (start is NaT or end is NaT):
- raise ValueError('start and end must not be NaT')
+ raise ValueError("start and end must have same freq")
+ if start is NaT or end is NaT:
+ raise ValueError("start and end must not be NaT")
if freq is None:
if is_start_per:
@@ -903,25 +942,34 @@ def _get_ordinal_range(start, end, periods, freq, mult=1):
elif is_end_per:
freq = end.freq
else: # pragma: no cover
- raise ValueError('Could not infer freq from start/end')
+ raise ValueError("Could not infer freq from start/end")
if periods is not None:
periods = periods * mult
if start is None:
- data = np.arange(end.ordinal - periods + mult,
- end.ordinal + 1, mult,
- dtype=np.int64)
+ data = np.arange(
+ end.ordinal - periods + mult, end.ordinal + 1, mult, dtype=np.int64
+ )
else:
- data = np.arange(start.ordinal, start.ordinal + periods, mult,
- dtype=np.int64)
+ data = np.arange(
+ start.ordinal, start.ordinal + periods, mult, dtype=np.int64
+ )
else:
data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64)
return data, freq
-def _range_from_fields(year=None, month=None, quarter=None, day=None,
- hour=None, minute=None, second=None, freq=None):
+def _range_from_fields(
+ year=None,
+ month=None,
+ quarter=None,
+ day=None,
+ hour=None,
+ minute=None,
+ second=None,
+ freq=None,
+):
if hour is None:
hour = 0
if minute is None:
@@ -935,7 +983,7 @@ def _range_from_fields(year=None, month=None, quarter=None, day=None,
if quarter is not None:
if freq is None:
- freq = 'Q'
+ freq = "Q"
base = libfrequencies.FreqGroup.FR_QTR
else:
base, mult = libfrequencies.get_freq_code(freq)
@@ -951,8 +999,7 @@ def _range_from_fields(year=None, month=None, quarter=None, day=None,
base, mult = libfrequencies.get_freq_code(freq)
arrays = _make_field_arrays(year, month, day, hour, minute, second)
for y, mth, d, h, mn, s in zip(*arrays):
- ordinals.append(libperiod.period_ordinal(
- y, mth, d, h, mn, s, 0, 0, base))
+ ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base))
return np.array(ordinals, dtype=np.int64), freq
@@ -962,11 +1009,15 @@ def _make_field_arrays(*fields):
for x in fields:
if isinstance(x, (list, np.ndarray, ABCSeries)):
if length is not None and len(x) != length:
- raise ValueError('Mismatched Period array lengths')
+ raise ValueError("Mismatched Period array lengths")
elif length is None:
length = len(x)
- arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries))
- else np.repeat(x, length) for x in fields]
+ arrays = [
+ np.asarray(x)
+ if isinstance(x, (np.ndarray, list, ABCSeries))
+ else np.repeat(x, length)
+ for x in fields
+ ]
return arrays
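The PeriodArray.to_timestamp hunk above keeps the existing end-of-period logic: roll to the start of the next period and subtract one nanosecond. A minimal sketch of the observable behaviour through the public PeriodIndex API (output shown approximately; exact reprs vary by pandas version):

import pandas as pd

pi = pd.period_range("2000-01", periods=2, freq="M")
pi.to_timestamp(how="start")  # 2000-01-01, 2000-02-01
pi.to_timestamp(how="end")    # 2000-01-31 23:59:59.999999999, 2000-02-29 23:59:59.999999999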
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 29cc899fa6a9b..2332da46574c5 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -21,14 +21,29 @@
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import (
- astype_nansafe, construct_1d_arraylike_from_scalar, find_common_type,
- infer_dtype_from_scalar)
+ astype_nansafe,
+ construct_1d_arraylike_from_scalar,
+ find_common_type,
+ infer_dtype_from_scalar,
+)
from pandas.core.dtypes.common import (
- is_array_like, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal,
- is_integer, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
+ is_array_like,
+ is_bool_dtype,
+ is_datetime64_any_dtype,
+ is_dtype_equal,
+ is_integer,
+ is_object_dtype,
+ is_scalar,
+ is_string_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import (
- ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries)
+ ABCIndexClass,
+ ABCSeries,
+ ABCSparseArray,
+ ABCSparseSeries,
+)
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
from pandas._typing import Dtype
@@ -82,21 +97,16 @@ class SparseDtype(ExtensionDtype):
-------
None
"""
+
# We include `_is_na_fill_value` in the metadata to avoid hash collisions
# between SparseDtype(float, 0.0) and SparseDtype(float, nan).
# Without is_na_fill_value in the comparison, those would be equal since
# hash(nan) is (sometimes?) 0.
- _metadata = ('_dtype', '_fill_value', '_is_na_fill_value')
+ _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")
- def __init__(
- self,
- dtype: Dtype = np.float64,
- fill_value: Any = None
- ) -> None:
+ def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
from pandas.core.dtypes.missing import na_value_for_dtype
- from pandas.core.dtypes.common import (
- pandas_dtype, is_string_dtype, is_scalar
- )
+ from pandas.core.dtypes.common import pandas_dtype, is_string_dtype, is_scalar
if isinstance(dtype, type(self)):
if fill_value is None:
@@ -105,14 +115,15 @@ def __init__(
dtype = pandas_dtype(dtype)
if is_string_dtype(dtype):
- dtype = np.dtype('object')
+ dtype = np.dtype("object")
if fill_value is None:
fill_value = na_value_for_dtype(dtype)
if not is_scalar(fill_value):
- raise ValueError("fill_value must be a scalar. Got {} "
- "instead".format(fill_value))
+ raise ValueError(
+ "fill_value must be a scalar. Got {} " "instead".format(fill_value)
+ )
self._dtype = dtype
self._fill_value = fill_value
@@ -139,9 +150,9 @@ def __eq__(self, other):
# i.e. we want to treat any floating-point NaN as equal, but
# not a floating-point NaN and a datetime NaT.
fill_value = (
- other._is_na_fill_value and
- isinstance(self.fill_value, type(other.fill_value)) or
- isinstance(other.fill_value, type(self.fill_value))
+ other._is_na_fill_value
+ and isinstance(self.fill_value, type(other.fill_value))
+ or isinstance(other.fill_value, type(self.fill_value))
)
else:
fill_value = self.fill_value == other.fill_value
@@ -168,16 +179,19 @@ def fill_value(self):
@property
def _is_na_fill_value(self):
from pandas.core.dtypes.missing import isna
+
return isna(self.fill_value)
@property
def _is_numeric(self):
from pandas.core.dtypes.common import is_object_dtype
+
return not is_object_dtype(self.subtype)
@property
def _is_boolean(self):
from pandas.core.dtypes.common import is_bool_dtype
+
return is_bool_dtype(self.subtype)
@property
@@ -197,7 +211,7 @@ def subtype(self):
@property
def name(self):
- return 'Sparse[{}, {}]'.format(self.subtype.name, self.fill_value)
+ return "Sparse[{}, {}]".format(self.subtype.name, self.fill_value)
def __repr__(self):
return self.name
@@ -241,11 +255,13 @@ def construct_from_string(cls, string):
except Exception:
raise TypeError(msg)
else:
- msg = ("Could not construct SparseDtype from '{}'.\n\nIt "
- "looks like the fill_value in the string is not "
- "the default for the dtype. Non-default fill_values "
- "are not supported. Use the 'SparseDtype()' "
- "constructor instead.")
+ msg = (
+ "Could not construct SparseDtype from '{}'.\n\nIt "
+ "looks like the fill_value in the string is not "
+ "the default for the dtype. Non-default fill_values "
+ "are not supported. Use the 'SparseDtype()' "
+ "constructor instead."
+ )
if has_fill_value and str(result) != string:
raise TypeError(msg.format(string))
return result
@@ -274,30 +290,27 @@ def _parse_subtype(dtype):
ValueError
When the subtype cannot be extracted.
"""
- xpr = re.compile(
- r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$"
- )
+ xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
m = xpr.match(dtype)
has_fill_value = False
if m:
- subtype = m.groupdict()['subtype']
- has_fill_value = m.groupdict()['fill_value'] or has_fill_value
+ subtype = m.groupdict()["subtype"]
+ has_fill_value = m.groupdict()["fill_value"] or has_fill_value
elif dtype == "Sparse":
- subtype = 'float64'
+ subtype = "float64"
else:
raise ValueError("Cannot parse {}".format(dtype))
return subtype, has_fill_value
@classmethod
def is_dtype(cls, dtype):
- dtype = getattr(dtype, 'dtype', dtype)
- if (isinstance(dtype, str) and
- dtype.startswith("Sparse")):
+ dtype = getattr(dtype, "dtype", dtype)
+ if isinstance(dtype, str) and dtype.startswith("Sparse"):
sub_type, _ = cls._parse_subtype(dtype)
dtype = np.dtype(sub_type)
elif isinstance(dtype, cls):
return True
- return isinstance(dtype, np.dtype) or dtype == 'Sparse'
+ return isinstance(dtype, np.dtype) or dtype == "Sparse"
def update_dtype(self, dtype):
"""
@@ -341,8 +354,7 @@ def update_dtype(self, dtype):
dtype = pandas_dtype(dtype)
if not isinstance(dtype, cls):
- fill_value = astype_nansafe(np.array(self.fill_value),
- dtype).item()
+ fill_value = astype_nansafe(np.array(self.fill_value), dtype).item()
dtype = cls(dtype, fill_value=fill_value)
return dtype
@@ -381,7 +393,7 @@ def _subtype_with_str(self):
# Array
-_sparray_doc_kwargs = dict(klass='SparseArray')
+_sparray_doc_kwargs = dict(klass="SparseArray")
def _get_fill(arr: ABCSparseArray) -> np.ndarray:
@@ -409,10 +421,7 @@ def _get_fill(arr: ABCSparseArray) -> np.ndarray:
def _sparse_array_op(
- left: ABCSparseArray,
- right: ABCSparseArray,
- op: Callable,
- name: str
+ left: ABCSparseArray, right: ABCSparseArray, op: Callable, name: str
) -> Any:
"""
Perform a binary operation between two arrays.
@@ -430,7 +439,7 @@ def _sparse_array_op(
-------
SparseArray
"""
- if name.startswith('__'):
+ if name.startswith("__"):
# For lookups in _libs.sparse we need non-dunder op name
name = name[2:-2]
@@ -454,7 +463,7 @@ def _sparse_array_op(
result_dtype = None
if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(left.to_dense(), right.to_dense())
fill = op(_get_fill(left), _get_fill(right))
@@ -463,32 +472,37 @@ def _sparse_array_op(
else:
index = right.sp_index
elif left.sp_index.equals(right.sp_index):
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(left.sp_values, right.sp_values)
fill = op(_get_fill(left), _get_fill(right))
index = left.sp_index
else:
- if name[0] == 'r':
+ if name[0] == "r":
left, right = right, left
name = name[1:]
- if name in ('and', 'or') and dtype == 'bool':
- opname = 'sparse_{name}_uint8'.format(name=name)
+ if name in ("and", "or") and dtype == "bool":
+ opname = "sparse_{name}_uint8".format(name=name)
# to make template simple, cast here
left_sp_values = left.sp_values.view(np.uint8)
right_sp_values = right.sp_values.view(np.uint8)
result_dtype = np.bool
else:
- opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)
+ opname = "sparse_{name}_{dtype}".format(name=name, dtype=dtype)
left_sp_values = left.sp_values
right_sp_values = right.sp_values
sparse_op = getattr(splib, opname)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result, index, fill = sparse_op(
- left_sp_values, left.sp_index, left.fill_value,
- right_sp_values, right.sp_index, right.fill_value)
+ left_sp_values,
+ left.sp_index,
+ left.fill_value,
+ right_sp_values,
+ right.sp_index,
+ right.fill_value,
+ )
if result_dtype is None:
result_dtype = result.dtype
@@ -500,11 +514,11 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
"""
wrap op result to have correct dtype
"""
- if name.startswith('__'):
+ if name.startswith("__"):
# e.g. __eq__ --> eq
name = name[2:-2]
- if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
+ if name in ("eq", "ne", "lt", "gt", "le", "ge"):
dtype = np.bool
fill_value = lib.item_from_zerodim(fill_value)
@@ -512,10 +526,9 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
if is_bool_dtype(dtype):
# fill_value may be np.bool_
fill_value = bool(fill_value)
- return SparseArray(data,
- sparse_index=sparse_index,
- fill_value=fill_value,
- dtype=dtype)
+ return SparseArray(
+ data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
+ )
class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
@@ -583,11 +596,19 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin):
None
"""
- _pandas_ftype = 'sparse'
- _subtyp = 'sparse_array' # register ABCSparseArray
+ _pandas_ftype = "sparse"
+ _subtyp = "sparse_array" # register ABCSparseArray
- def __init__(self, data, sparse_index=None, index=None, fill_value=None,
- kind='integer', dtype=None, copy=False):
+ def __init__(
+ self,
+ data,
+ sparse_index=None,
+ index=None,
+ fill_value=None,
+ kind="integer",
+ dtype=None,
+ copy=False,
+ ):
from pandas.core.internals import SingleBlockManager
if isinstance(data, SingleBlockManager):
@@ -637,9 +658,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None,
npoints = sparse_index.length
dtype = infer_dtype_from_scalar(data)[0]
- data = construct_1d_arraylike_from_scalar(
- data, npoints, dtype
- )
+ data = construct_1d_arraylike_from_scalar(data, npoints, dtype)
if dtype is not None:
dtype = pandas_dtype(dtype)
@@ -654,6 +673,7 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None,
try:
# probably shared code in sanitize_series
from pandas.core.internals.construction import sanitize_array
+
data = sanitize_array(data, index=None)
except ValueError:
# NumPy may raise a ValueError on data like [1, []]
@@ -685,19 +705,17 @@ def __init__(self, data, sparse_index=None, index=None, fill_value=None,
else:
sparse_values = np.asarray(data, dtype=dtype)
if len(sparse_values) != sparse_index.npoints:
- raise AssertionError("Non array-like type {type} must "
- "have the same length as the index"
- .format(type=type(sparse_values)))
+ raise AssertionError(
+ "Non array-like type {type} must "
+ "have the same length as the index".format(type=type(sparse_values))
+ )
self._sparse_index = sparse_index
self._sparse_values = sparse_values
self._dtype = SparseDtype(sparse_values.dtype, fill_value)
@classmethod
def _simple_new(
- cls,
- sparse_array: np.ndarray,
- sparse_index: SparseIndex,
- dtype: SparseDtype
+ cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype
) -> ABCSparseArray:
new = cls([])
new._sparse_index = sparse_index
@@ -736,9 +754,7 @@ def from_spmatrix(cls, data):
length, ncol = data.shape
if ncol != 1:
- raise ValueError(
- "'data' must have a single column, not '{}'".format(ncol)
- )
+ raise ValueError("'data' must have a single column, not '{}'".format(ncol))
# our sparse index classes require that the positions be strictly
# increasing. So we need to sort loc, and arr accordingly.
@@ -769,7 +785,7 @@ def __array__(self, dtype=None, copy=True):
# a datetime64 with pandas NaT.
if fill_value is NaT:
# Can't put pd.NaT in a datetime64[ns]
- fill_value = np.datetime64('NaT')
+ fill_value = np.datetime64("NaT")
try:
dtype = np.result_type(self.sp_values.dtype, type(fill_value))
except TypeError:
@@ -840,9 +856,9 @@ def kind(self):
The kind of sparse index for this array. One of {'integer', 'block'}.
"""
if isinstance(self.sp_index, IntIndex):
- return 'integer'
+ return "integer"
else:
- return 'block'
+ return "block"
@property
def _valid_sp_values(self):
@@ -906,17 +922,18 @@ def values(self):
msg = (
"The SparseArray.values attribute is deprecated and will be "
"removed in a future version. You can use `np.asarray(...)` or "
- "the `.to_dense()` method instead.")
+ "the `.to_dense()` method instead."
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
return self.to_dense()
def isna(self):
from pandas import isna
+
# If null fill value, we want SparseDtype[bool, true]
# to preserve the same memory usage.
dtype = SparseDtype(bool, self._null_fill_value)
- return type(self)._simple_new(isna(self.sp_values),
- self.sp_index, dtype)
+ return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
def fillna(self, value=None, method=None, limit=None):
"""
@@ -951,15 +968,15 @@ def fillna(self, value=None, method=None, limit=None):
When ``self.fill_value`` is not NA, the result dtype will be
``self.dtype``. Again, this preserves the amount of memory used.
"""
- if ((method is None and value is None) or
- (method is not None and value is not None)):
+ if (method is None and value is None) or (
+ method is not None and value is not None
+ ):
raise ValueError("Must specify one of 'method' or 'value'.")
elif method is not None:
msg = "fillna with 'method' requires high memory usage."
warnings.warn(msg, PerformanceWarning)
- filled = interpolate_2d(np.asarray(self), method=method,
- limit=limit)
+ filled = interpolate_2d(np.asarray(self), method=method, limit=limit)
return type(self)(filled, fill_value=self.fill_value)
else:
@@ -990,15 +1007,14 @@ def shift(self, periods=1, fill_value=None):
arr = self
empty = self._from_sequence(
- [fill_value] * min(abs(periods), len(self)),
- dtype=arr.dtype
+ [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
)
if periods > 0:
a = empty
b = arr[:-periods]
else:
- a = arr[abs(periods):]
+ a = arr[abs(periods) :]
b = empty
return arr._concat_same_type([a, b])
@@ -1037,8 +1053,7 @@ def factorize(self, na_sentinel=-1):
# ExtensionArray.factorize -> Tuple[EA, EA]
# Given that we have to return a dense array of labels, why bother
# implementing an efficient factorize?
- labels, uniques = algos.factorize(np.asarray(self),
- na_sentinel=na_sentinel)
+ labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
uniques = SparseArray(uniques, dtype=self.dtype)
return labels, uniques
@@ -1057,8 +1072,7 @@ def value_counts(self, dropna=True):
"""
from pandas import Index, Series
- keys, counts = algos._value_counts_arraylike(self.sp_values,
- dropna=dropna)
+ keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna)
fcounts = self.sp_index.ngaps
if fcounts > 0:
if self._null_fill_value and dropna:
@@ -1115,7 +1129,7 @@ def __getitem__(self, key):
if com.is_bool_indexer(key) and len(self) == len(key):
return self.take(np.arange(len(key), dtype=np.int32)[key])
- elif hasattr(key, '__len__'):
+ elif hasattr(key, "__len__"):
return self.take(key)
else:
raise ValueError("Cannot slice with '{}'".format(key))
@@ -1128,7 +1142,7 @@ def _get_val_at(self, loc):
loc += n
if loc >= n or loc < 0:
- raise IndexError('Out of bounds access')
+ raise IndexError("Out of bounds access")
sp_loc = self.sp_index.lookup(loc)
if sp_loc == -1:
@@ -1138,30 +1152,32 @@ def _get_val_at(self, loc):
def take(self, indices, allow_fill=False, fill_value=None):
if is_scalar(indices):
- raise ValueError("'indices' must be an array, not a "
- "scalar '{}'.".format(indices))
+ raise ValueError(
+                "'indices' must be an array, not a scalar '{}'.".format(indices)
+ )
indices = np.asarray(indices, dtype=np.int32)
if indices.size == 0:
result = []
- kwargs = {'dtype': self.dtype}
+ kwargs = {"dtype": self.dtype}
elif allow_fill:
result = self._take_with_fill(indices, fill_value=fill_value)
kwargs = {}
else:
result = self._take_without_fill(indices)
- kwargs = {'dtype': self.dtype}
+ kwargs = {"dtype": self.dtype}
- return type(self)(result, fill_value=self.fill_value, kind=self.kind,
- **kwargs)
+ return type(self)(result, fill_value=self.fill_value, kind=self.kind, **kwargs)
def _take_with_fill(self, indices, fill_value=None):
if fill_value is None:
fill_value = self.dtype.na_value
if indices.min() < -1:
- raise ValueError("Invalid value in 'indices'. Must be between -1 "
- "and the length of the array.")
+ raise ValueError(
+ "Invalid value in 'indices'. Must be between -1 "
+ "and the length of the array."
+ )
if indices.max() >= len(self):
raise IndexError("out of bounds value in 'indices'.")
@@ -1174,15 +1190,17 @@ def _take_with_fill(self, indices, fill_value=None):
taken.fill(fill_value)
return taken
else:
- raise IndexError('cannot do a non-empty take from an empty '
- 'axes.')
+                raise IndexError("cannot do a non-empty take from an empty axes.")
sp_indexer = self.sp_index.lookup_array(indices)
if self.sp_index.npoints == 0:
# Avoid taking from the empty self.sp_values
- taken = np.full(sp_indexer.shape, fill_value=fill_value,
- dtype=np.result_type(type(fill_value)))
+ taken = np.full(
+ sp_indexer.shape,
+ fill_value=fill_value,
+ dtype=np.result_type(type(fill_value)),
+ )
else:
taken = self.sp_values.take(sp_indexer)
@@ -1203,8 +1221,7 @@ def _take_with_fill(self, indices, fill_value=None):
result_type = taken.dtype
if m0.any():
- result_type = np.result_type(result_type,
- type(self.fill_value))
+ result_type = np.result_type(result_type, type(self.fill_value))
taken = taken.astype(result_type)
taken[old_fill_indices] = self.fill_value
@@ -1223,8 +1240,7 @@ def _take_without_fill(self, indices):
if (indices.max() >= n) or (indices.min() < -n):
if n == 0:
- raise IndexError("cannot do a non-empty take from an "
- "empty axes.")
+                raise IndexError("cannot do a non-empty take from an empty axes.")
else:
raise IndexError("out of bounds value in 'indices'.")
@@ -1234,16 +1250,17 @@ def _take_without_fill(self, indices):
if self.sp_index.npoints == 0:
# edge case in take...
# I think just return
- out = np.full(indices.shape, self.fill_value,
- dtype=np.result_type(type(self.fill_value)))
- arr, sp_index, fill_value = make_sparse(out,
- fill_value=self.fill_value)
- return type(self)(arr, sparse_index=sp_index,
- fill_value=fill_value)
+ out = np.full(
+ indices.shape,
+ self.fill_value,
+ dtype=np.result_type(type(self.fill_value)),
+ )
+ arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value)
+ return type(self)(arr, sparse_index=sp_index, fill_value=fill_value)
sp_indexer = self.sp_index.lookup_array(indices)
taken = self.sp_values.take(sp_indexer)
- fillable = (sp_indexer < 0)
+ fillable = sp_indexer < 0
if fillable.any():
# TODO: may need to coerce array to fill value
@@ -1259,9 +1276,7 @@ def searchsorted(self, v, side="left", sorter=None):
if not is_scalar(v):
v = np.asarray(v)
v = np.asarray(v)
- return np.asarray(self, dtype=self.dtype.subtype).searchsorted(
- v, side, sorter
- )
+ return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
def copy(self):
values = self.sp_values.copy()
@@ -1276,11 +1291,13 @@ def _concat_same_type(cls, to_concat):
# np.nan isn't a singleton, so we may end up with multiple
         # NaNs here, so we ignore the all-NA case too.
if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
- warnings.warn("Concatenating sparse arrays with multiple fill "
- "values: '{}'. Picking the first and "
- "converting the rest.".format(fill_values),
- PerformanceWarning,
- stacklevel=6)
+ warnings.warn(
+ "Concatenating sparse arrays with multiple fill "
+ "values: '{}'. Picking the first and "
+ "converting the rest.".format(fill_values),
+ PerformanceWarning,
+ stacklevel=6,
+ )
keep = to_concat[0]
to_concat2 = [keep]
@@ -1295,9 +1312,9 @@ def _concat_same_type(cls, to_concat):
if to_concat:
sp_kind = to_concat[0].kind
else:
- sp_kind = 'integer'
+ sp_kind = "integer"
- if sp_kind == 'integer':
+ if sp_kind == "integer":
indices = []
for arr in to_concat:
@@ -1396,15 +1413,11 @@ def astype(self, dtype=None, copy=True):
"""
dtype = self.dtype.update_dtype(dtype)
subtype = dtype._subtype_with_str
- sp_values = astype_nansafe(self.sp_values,
- subtype,
- copy=copy)
+ sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
if sp_values is self.sp_values and copy:
sp_values = sp_values.copy()
- return self._simple_new(sp_values,
- self.sp_index,
- dtype)
+ return self._simple_new(sp_values, self.sp_index, dtype)
def map(self, mapper):
"""
@@ -1456,8 +1469,7 @@ def map(self, mapper):
fill_value = mapper(self.fill_value)
sp_values = [mapper(x) for x in self.sp_values]
- return type(self)(sp_values, sparse_index=self.sp_index,
- fill_value=fill_value)
+ return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
def to_dense(self):
"""
@@ -1480,7 +1492,9 @@ def get_values(self):
warnings.warn(
"The 'get_values' method is deprecated and will be removed in a "
"future version. Use the 'to_dense' method instead.",
- FutureWarning, stacklevel=2)
+ FutureWarning,
+ stacklevel=2,
+ )
return self._internal_get_values()
_internal_get_values = to_dense
@@ -1504,9 +1518,9 @@ def __setstate__(self, state):
def nonzero(self):
if self.fill_value == 0:
- return self.sp_index.to_int_index().indices,
+ return (self.sp_index.to_int_index().indices,)
else:
- return self.sp_index.to_int_index().indices[self.sp_values != 0],
+ return (self.sp_index.to_int_index().indices[self.sp_values != 0],)
# ------------------------------------------------------------------------
# Reductions
@@ -1516,8 +1530,11 @@ def _reduce(self, name, skipna=True, **kwargs):
method = getattr(self, name, None)
if method is None:
- raise TypeError("cannot perform {name} with type {dtype}".format(
- name=name, dtype=self.dtype))
+ raise TypeError(
+ "cannot perform {name} with type {dtype}".format(
+ name=name, dtype=self.dtype
+ )
+ )
if skipna:
arr = self
@@ -1528,9 +1545,9 @@ def _reduce(self, name, skipna=True, **kwargs):
# They should only be present when called via pandas, so do it here.
# instead of in `any` / `all` (which will raise if they're present,
# thanks to nv.validate
- kwargs.pop('filter_type', None)
- kwargs.pop('numeric_only', None)
- kwargs.pop('op', None)
+ kwargs.pop("filter_type", None)
+ kwargs.pop("numeric_only", None)
+ kwargs.pop("op", None)
return getattr(arr, name)(**kwargs)
def all(self, axis=None, *args, **kwargs):
@@ -1618,8 +1635,11 @@ def cumsum(self, axis=0, *args, **kwargs):
if not self._null_fill_value:
return SparseArray(self.to_dense()).cumsum()
- return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
- fill_value=self.fill_value)
+ return SparseArray(
+ self.sp_values.cumsum(),
+ sparse_index=self.sp_index,
+ fill_value=self.fill_value,
+ )
def mean(self, axis=0, *args, **kwargs):
"""
@@ -1660,7 +1680,7 @@ def T(self):
_HANDLED_TYPES = (np.ndarray, numbers.Number)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
- out = kwargs.get('out', ())
+ out = kwargs.get("out", ())
for x in inputs + out:
if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
@@ -1668,7 +1688,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs)
+ self, ufunc, method, *inputs, **kwargs
+ )
if result is not NotImplemented:
return result
@@ -1680,19 +1701,18 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if isinstance(sp_values, tuple):
# multiple outputs. e.g. modf
arrays = tuple(
- self._simple_new(sp_value,
- self.sp_index,
- SparseDtype(sp_value.dtype, fv))
+ self._simple_new(
+ sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
+ )
for sp_value, fv in zip(sp_values, fill_value)
)
return arrays
- return self._simple_new(sp_values,
- self.sp_index,
- SparseDtype(sp_values.dtype, fill_value))
+ return self._simple_new(
+ sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
+ )
- result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs],
- **kwargs)
+ result = getattr(ufunc, method)(*[np.asarray(x) for x in inputs], **kwargs)
if out:
if len(out) == 1:
out = out[0]
@@ -1700,7 +1720,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
if type(result) is tuple:
return tuple(type(self)(x) for x in result)
- elif method == 'at':
+ elif method == "at":
# no return value
return None
else:
@@ -1721,7 +1741,7 @@ def sparse_unary_method(self):
dtype = SparseDtype(values.dtype, fill_value)
return cls._simple_new(values, self.sp_index, dtype)
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
return compat.set_function_name(sparse_unary_method, name, cls)
@classmethod
@@ -1737,34 +1757,41 @@ def sparse_arithmetic_method(self, other):
return _sparse_array_op(self, other, op, op_name)
elif is_scalar(other):
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
fill = op(_get_fill(self), np.asarray(other))
result = op(self.sp_values, other)
- if op_name == 'divmod':
+ if op_name == "divmod":
left, right = result
lfill, rfill = fill
- return (_wrap_result(op_name, left, self.sp_index, lfill),
- _wrap_result(op_name, right, self.sp_index, rfill))
+ return (
+ _wrap_result(op_name, left, self.sp_index, lfill),
+ _wrap_result(op_name, right, self.sp_index, rfill),
+ )
return _wrap_result(op_name, result, self.sp_index, fill)
else:
other = np.asarray(other)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
# TODO: delete sparse stuff in core/ops.py
# TODO: look into _wrap_result
if len(self) != len(other):
raise AssertionError(
- ("length mismatch: {self} vs. {other}".format(
- self=len(self), other=len(other))))
+ (
+ "length mismatch: {self} vs. {other}".format(
+ self=len(self), other=len(other)
+ )
+ )
+ )
if not isinstance(other, SparseArray):
- dtype = getattr(other, 'dtype', None)
- other = SparseArray(other, fill_value=self.fill_value,
- dtype=dtype)
+ dtype = getattr(other, "dtype", None)
+ other = SparseArray(
+ other, fill_value=self.fill_value, dtype=dtype
+ )
return _sparse_array_op(self, other, op, op_name)
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
return compat.set_function_name(sparse_arithmetic_method, name, cls)
@classmethod
@@ -1772,7 +1799,7 @@ def _create_comparison_method(cls, op):
def cmp_method(self, other):
op_name = op.__name__
- if op_name in {'and_', 'or_'}:
+ if op_name in {"and_", "or_"}:
op_name = op_name[:-1]
if isinstance(other, (ABCSeries, ABCIndexClass)):
@@ -1786,24 +1813,28 @@ def cmp_method(self, other):
if isinstance(other, np.ndarray):
# TODO: make this more flexible than just ndarray...
if len(self) != len(other):
- raise AssertionError("length mismatch: {self} vs. {other}"
- .format(self=len(self),
- other=len(other)))
+ raise AssertionError(
+ "length mismatch: {self} vs. {other}".format(
+ self=len(self), other=len(other)
+ )
+ )
other = SparseArray(other, fill_value=self.fill_value)
if isinstance(other, SparseArray):
return _sparse_array_op(self, other, op, op_name)
else:
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
fill_value = op(self.fill_value, other)
result = op(self.sp_values, other)
- return type(self)(result,
- sparse_index=self.sp_index,
- fill_value=fill_value,
- dtype=np.bool_)
+ return type(self)(
+ result,
+ sparse_index=self.sp_index,
+ fill_value=fill_value,
+ dtype=np.bool_,
+ )
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
return compat.set_function_name(cmp_method, name, cls)
@classmethod
@@ -1822,10 +1853,11 @@ def _add_comparison_ops(cls):
# Formatting
# -----------
def __repr__(self):
- return '{self}\nFill: {fill}\n{index}'.format(
+ return "{self}\nFill: {fill}\n{index}".format(
self=printing.pprint_thing(self),
fill=printing.pprint_thing(self.fill_value),
- index=printing.pprint_thing(self.sp_index))
+ index=printing.pprint_thing(self.sp_index),
+ )
def _formatter(self, boxed=False):
# Defer to the formatter from the GenericArrayFormatter calling us.
@@ -1842,12 +1874,12 @@ def _maybe_to_dense(obj):
"""
try to convert to dense
"""
- if hasattr(obj, 'to_dense'):
+ if hasattr(obj, "to_dense"):
return obj.to_dense()
return obj
-def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
+def make_sparse(arr, kind="block", fill_value=None, dtype=None, copy=False):
"""
Convert ndarray to sparse format
@@ -1904,13 +1936,13 @@ def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
def _make_index(length, indices, kind):
- if kind == 'block' or isinstance(kind, BlockIndex):
+ if kind == "block" or isinstance(kind, BlockIndex):
locs, lens = splib.get_blocks(indices)
index = BlockIndex(length, locs, lens)
- elif kind == 'integer' or isinstance(kind, IntIndex):
+ elif kind == "integer" or isinstance(kind, IntIndex):
index = IntIndex(length, indices)
else: # pragma: no cover
- raise ValueError('must be block or integer type')
+ raise ValueError("must be block or integer type")
return index
@@ -1929,9 +1961,9 @@ def _validate(self, data):
raise NotImplementedError
-@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
- 'sp_values'],
- typ='property')
+@delegate_names(
+ SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
+)
class SparseAccessor(BaseAccessor, PandasDelegate):
"""
Accessor for SparseSparse from other sparse matrix data types.
@@ -1945,9 +1977,9 @@ def _delegate_property_get(self, name, *args, **kwargs):
return getattr(self._parent.array, name)
def _delegate_method(self, name, *args, **kwargs):
- if name == 'from_coo':
+ if name == "from_coo":
return self.from_coo(*args, **kwargs)
- elif name == 'to_coo':
+ elif name == "to_coo":
return self.to_coo(*args, **kwargs)
else:
raise ValueError
@@ -1995,13 +2027,12 @@ def from_coo(cls, A, dense_index=False):
from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
from pandas import Series
- result = _coo_to_sparse_series(A, dense_index=dense_index,
- sparse_series=False)
+ result = _coo_to_sparse_series(A, dense_index=dense_index, sparse_series=False)
result = Series(result.array, index=result.index, copy=False)
return result
- def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
@@ -2051,10 +2082,9 @@ def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
"""
from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
- A, rows, columns = _sparse_series_to_coo(self._parent,
- row_levels,
- column_levels,
- sort_labels=sort_labels)
+ A, rows, columns = _sparse_series_to_coo(
+ self._parent, row_levels, column_levels, sort_labels=sort_labels
+ )
return A, rows, columns
def to_dense(self):
@@ -2084,9 +2114,12 @@ def to_dense(self):
dtype: int64
"""
from pandas import Series
- return Series(self._parent.array.to_dense(),
- index=self._parent.index,
- name=self._parent.name)
+
+ return Series(
+ self._parent.array.to_dense(),
+ index=self._parent.index,
+ name=self._parent.name,
+ )
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
@@ -2136,10 +2169,7 @@ def from_spmatrix(cls, data, index=None, columns=None):
data = data.tocsc()
index, columns = cls._prep_index(data, index, columns)
- sparrays = [
- SparseArray.from_spmatrix(data[:, i])
- for i in range(data.shape[1])
- ]
+ sparrays = [SparseArray.from_spmatrix(data[:, i]) for i in range(data.shape[1])]
data = dict(enumerate(sparrays))
result = DataFrame(data, index=index)
result.columns = columns
@@ -2167,11 +2197,8 @@ def to_dense(self):
"""
from pandas import DataFrame
- data = {k: v.array.to_dense()
- for k, v in self._parent.items()}
- return DataFrame(data,
- index=self._parent.index,
- columns=self._parent.columns)
+ data = {k: v.array.to_dense() for k, v in self._parent.items()}
+ return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
def to_coo(self):
"""
@@ -2221,8 +2248,7 @@ def density(self) -> float:
Ratio of non-sparse points to total (dense) data points
represented in the DataFrame.
"""
- return np.mean([column.array.density
- for _, column in self._parent.items()])
+ return np.mean([column.array.density for _, column in self._parent.items()])
@staticmethod
def _prep_index(data, index, columns):
@@ -2235,9 +2261,13 @@ def _prep_index(data, index, columns):
columns = ibase.default_index(K)
if len(columns) != K:
- raise ValueError('Column length mismatch: {columns} vs. {K}'
- .format(columns=len(columns), K=K))
+ raise ValueError(
+ "Column length mismatch: {columns} vs. {K}".format(
+ columns=len(columns), K=K
+ )
+ )
if len(index) != N:
- raise ValueError('Index length mismatch: {index} vs. {N}'
- .format(index=len(index), N=N))
+ raise ValueError(
+ "Index length mismatch: {index} vs. {N}".format(index=len(index), N=N)
+ )
return index, columns
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 50bc8d6d3ae6b..9d622d92e0979 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -9,18 +9,36 @@
from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
from pandas._libs.tslibs.fields import get_timedelta_field
from pandas._libs.tslibs.timedeltas import (
- array_to_timedelta64, parse_timedelta_unit, precision_from_unit)
+ array_to_timedelta64,
+ parse_timedelta_unit,
+ precision_from_unit,
+)
import pandas.compat as compat
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import (
- _NS_DTYPE, _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_dtype_equal,
- is_float_dtype, is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
- is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype,
- pandas_dtype)
+ _NS_DTYPE,
+ _TD_DTYPE,
+ ensure_int64,
+ is_datetime64_dtype,
+ is_dtype_equal,
+ is_float_dtype,
+ is_integer_dtype,
+ is_list_like,
+ is_object_dtype,
+ is_scalar,
+ is_string_dtype,
+ is_timedelta64_dtype,
+ is_timedelta64_ns_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex)
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCSeries,
+ ABCTimedeltaIndex,
+)
from pandas.core.dtypes.missing import isna
from pandas.core import ops
@@ -44,8 +62,9 @@ def f(self):
values = self.asi8
result = get_timedelta_field(values, alias)
if self._hasnans:
- result = self._maybe_mask_results(result, fill_value=None,
- convert='float64')
+ result = self._maybe_mask_results(
+ result, fill_value=None, convert="float64"
+ )
return result
@@ -58,8 +77,8 @@ def _td_array_cmp(cls, op):
"""
Wrap comparison operations to convert timedelta-like to timedelta64
"""
- opname = '__{name}__'.format(name=op.__name__)
- nat_result = opname == '__ne__'
+ opname = "__{name}__".format(name=op.__name__)
+ nat_result = opname == "__ne__"
def wrapper(self, other):
other = lib.item_from_zerodim(other)
@@ -73,7 +92,7 @@ def wrapper(self, other):
# failed to parse as timedelta
return ops.invalid_comparison(self, other, op)
- result = op(self.view('i8'), other.value)
+ result = op(self.view("i8"), other.value)
if isna(other):
result.fill(nat_result)
@@ -89,7 +108,7 @@ def wrapper(self, other):
except (ValueError, TypeError):
return ops.invalid_comparison(self, other, op)
- result = op(self.view('i8'), other.view('i8'))
+ result = op(self.view("i8"), other.view("i8"))
result = com.values_from_object(result)
o_mask = np.array(isna(other))
@@ -136,24 +155,30 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps):
-------
None
"""
+
_typ = "timedeltaarray"
_scalar_type = Timedelta
__array_priority__ = 1000
# define my properties & methods for delegation
_other_ops = [] # type: List[str]
_bool_ops = [] # type: List[str]
- _object_ops = ['freq']
- _field_ops = ['days', 'seconds', 'microseconds', 'nanoseconds']
+ _object_ops = ["freq"]
+ _field_ops = ["days", "seconds", "microseconds", "nanoseconds"]
_datetimelike_ops = _field_ops + _object_ops + _bool_ops
- _datetimelike_methods = ["to_pytimedelta", "total_seconds",
- "round", "floor", "ceil"]
+ _datetimelike_methods = [
+ "to_pytimedelta",
+ "total_seconds",
+ "round",
+ "floor",
+ "ceil",
+ ]
# Needed so that NaT.__richcmp__(DateTimeArray) operates pointwise
ndim = 1
@property
def _box_func(self):
- return lambda x: Timedelta(x, unit='ns')
+ return lambda x: Timedelta(x, unit="ns")
@property
def dtype(self):
@@ -199,7 +224,7 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False):
if values.ndim != 1:
raise ValueError("Only 1-dimensional input arrays are supported.")
- if values.dtype == 'i8':
+ if values.dtype == "i8":
# for compat with datetime/timedelta/period shared methods,
# we can sometimes get here with int64 values. These represent
# nanosecond UTC (or tz-naive) unix timestamps
@@ -239,15 +264,13 @@ def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE):
return result
@classmethod
- def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False,
- freq=None, unit=None):
+ def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, freq=None, unit=None):
if dtype:
_validate_td64_dtype(dtype)
freq, freq_infer = dtl.maybe_infer_freq(freq)
data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
- freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq,
- freq_infer)
+ freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer)
result = cls._simple_new(data, freq=freq)
@@ -267,12 +290,13 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
periods = dtl.validate_periods(periods)
if freq is None and any(x is None for x in [periods, start, end]):
- raise ValueError('Must provide freq argument if no data is '
- 'supplied')
+            raise ValueError("Must provide freq argument if no data is supplied")
if com.count_not_none(start, end, periods, freq) != 3:
- raise ValueError('Of the four parameters: start, end, periods, '
- 'and freq, exactly three must be specified')
+ raise ValueError(
+ "Of the four parameters: start, end, periods, "
+ "and freq, exactly three must be specified"
+ )
if start is not None:
start = Timedelta(start)
@@ -282,15 +306,16 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
if start is None and end is None:
if closed is not None:
- raise ValueError("Closed has to be None if not both of start"
- "and end are defined")
+ raise ValueError(
+                    "Closed has to be None if not both of start and end are defined"
+ )
left_closed, right_closed = dtl.validate_endpoints(closed)
if freq is not None:
index = _generate_regular_range(start, end, periods, freq)
else:
- index = np.linspace(start.value, end.value, periods).astype('i8')
+ index = np.linspace(start.value, end.value, periods).astype("i8")
if not left_closed:
index = index[1:]
@@ -328,8 +353,10 @@ def _validate_fill_value(self, fill_value):
elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)):
fill_value = Timedelta(fill_value).value
else:
- raise ValueError("'fill_value' should be a Timedelta. "
- "Got '{got}'.".format(got=fill_value))
+ raise ValueError(
+ "'fill_value' should be a Timedelta. "
+ "Got '{got}'.".format(got=fill_value)
+ )
return fill_value
def astype(self, dtype, copy=True):
@@ -346,12 +373,12 @@ def astype(self, dtype, copy=True):
if self._hasnans:
# avoid double-copying
result = self._data.astype(dtype, copy=False)
- values = self._maybe_mask_results(result,
- fill_value=None,
- convert='float64')
+ values = self._maybe_mask_results(
+ result, fill_value=None, convert="float64"
+ )
return values
result = self._data.astype(dtype, copy=copy)
- return result.astype('i8')
+ return result.astype("i8")
elif is_timedelta64_ns_dtype(dtype):
if copy:
return self.copy()
@@ -363,9 +390,10 @@ def astype(self, dtype, copy=True):
def _formatter(self, boxed=False):
from pandas.io.formats.format import _get_format_timedelta64
+
return _get_format_timedelta64(self, box=True)
- def _format_native_types(self, na_rep='NaT', date_format=None):
+ def _format_native_types(self, na_rep="NaT", date_format=None):
from pandas.io.formats.format import _get_format_timedelta64
formatter = _get_format_timedelta64(self._data, na_rep)
@@ -378,9 +406,11 @@ def _format_native_types(self, na_rep='NaT', date_format=None):
def _add_offset(self, other):
assert not isinstance(other, Tick)
- raise TypeError("cannot add the type {typ} to a {cls}"
- .format(typ=type(other).__name__,
- cls=type(self).__name__))
+ raise TypeError(
+ "cannot add the type {typ} to a {cls}".format(
+ typ=type(other).__name__, cls=type(self).__name__
+ )
+ )
def _add_delta(self, delta):
"""
@@ -397,7 +427,7 @@ def _add_delta(self, delta):
result : TimedeltaArray
"""
new_values = super()._add_delta(delta)
- return type(self)._from_sequence(new_values, freq='infer')
+ return type(self)._from_sequence(new_values, freq="infer")
def _add_datetime_arraylike(self, other):
"""
@@ -406,6 +436,7 @@ def _add_datetime_arraylike(self, other):
if isinstance(other, np.ndarray):
# At this point we have already checked that dtype is datetime64
from pandas.core.arrays import DatetimeArray
+
other = DatetimeArray(other)
# defer to implementation in DatetimeArray
@@ -420,12 +451,11 @@ def _add_datetimelike_scalar(self, other):
if other is NaT:
# In this case we specifically interpret NaT as a datetime, not
# the timedelta interpretation we would get by returning self + NaT
- result = self.asi8.view('m8[ms]') + NaT.to_datetime64()
+ result = self.asi8.view("m8[ms]") + NaT.to_datetime64()
return DatetimeArray(result)
i8 = self.asi8
- result = checked_add_with_arr(i8, other.value,
- arr_mask=self._isnan)
+ result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan)
result = self._maybe_mask_results(result)
dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE
return DatetimeArray(result, dtype=dtype, freq=self.freq)
@@ -438,8 +468,11 @@ def _addsub_offset_array(self, other, op):
# which we re-raise as TypeError
return super()._addsub_offset_array(other, op)
except AttributeError:
- raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}"
- .format(cls=type(self).__name__))
+ raise TypeError(
+ "Cannot add/subtract non-tick DateOffset to {cls}".format(
+ cls=type(self).__name__
+ )
+ )
def __mul__(self, other):
other = lib.item_from_zerodim(other)
@@ -546,9 +579,11 @@ def __rtruediv__(self, other):
return other / self._data
elif lib.is_scalar(other):
- raise TypeError("Cannot divide {typ} by {cls}"
- .format(typ=type(other).__name__,
- cls=type(self).__name__))
+ raise TypeError(
+ "Cannot divide {typ} by {cls}".format(
+ typ=type(other).__name__, cls=type(self).__name__
+ )
+ )
if not hasattr(other, "dtype"):
# e.g. list, tuple
@@ -569,9 +604,11 @@ def __rtruediv__(self, other):
return np.array(result)
else:
- raise TypeError("Cannot divide {dtype} data by {cls}"
- .format(dtype=other.dtype,
- cls=type(self).__name__))
+ raise TypeError(
+ "Cannot divide {dtype} data by {cls}".format(
+ dtype=other.dtype, cls=type(self).__name__
+ )
+ )
def __floordiv__(self, other):
if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
@@ -599,7 +636,7 @@ def __floordiv__(self, other):
if self.freq is not None:
# Note: freq gets division, not floor-division
freq = self.freq / other
- return type(self)(result.view('m8[ns]'), freq=freq)
+ return type(self)(result.view("m8[ns]"), freq=freq)
if not hasattr(other, "dtype"):
# list, tuple
@@ -622,7 +659,7 @@ def __floordiv__(self, other):
elif is_object_dtype(other):
result = [self[n] // other[n] for n in range(len(self))]
result = np.array(result)
- if lib.infer_dtype(result, skipna=False) == 'timedelta':
+ if lib.infer_dtype(result, skipna=False) == "timedelta":
result, _ = sequence_to_td64ns(result)
return type(self)(result)
return result
@@ -633,8 +670,11 @@ def __floordiv__(self, other):
else:
dtype = getattr(other, "dtype", type(other).__name__)
- raise TypeError("Cannot divide {typ} by {cls}"
- .format(typ=dtype, cls=type(self).__name__))
+ raise TypeError(
+ "Cannot divide {typ} by {cls}".format(
+ typ=dtype, cls=type(self).__name__
+ )
+ )
def __rfloordiv__(self, other):
if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
@@ -654,9 +694,11 @@ def __rfloordiv__(self, other):
result = other.__floordiv__(self._data)
return result
- raise TypeError("Cannot divide {typ} by {cls}"
- .format(typ=type(other).__name__,
- cls=type(self).__name__))
+ raise TypeError(
+ "Cannot divide {typ} by {cls}".format(
+ typ=type(other).__name__, cls=type(self).__name__
+ )
+ )
if not hasattr(other, "dtype"):
# list, tuple
@@ -683,8 +725,11 @@ def __rfloordiv__(self, other):
else:
dtype = getattr(other, "dtype", type(other).__name__)
- raise TypeError("Cannot divide {typ} by {cls}"
- .format(typ=dtype, cls=type(self).__name__))
+ raise TypeError(
+ "Cannot divide {typ} by {cls}".format(
+ typ=dtype, cls=type(self).__name__
+ )
+ )
def __mod__(self, other):
# Note: This is a naive implementation, can likely be optimized
@@ -813,17 +858,22 @@ def to_pytimedelta(self):
"""
return tslibs.ints_to_pytimedelta(self.asi8)
- days = _field_accessor("days", "days",
- "Number of days for each element.")
- seconds = _field_accessor("seconds", "seconds",
- "Number of seconds (>= 0 and less than 1 day) "
- "for each element.")
- microseconds = _field_accessor("microseconds", "microseconds",
- "Number of microseconds (>= 0 and less "
- "than 1 second) for each element.")
- nanoseconds = _field_accessor("nanoseconds", "nanoseconds",
- "Number of nanoseconds (>= 0 and less "
- "than 1 microsecond) for each element.")
+ days = _field_accessor("days", "days", "Number of days for each element.")
+ seconds = _field_accessor(
+ "seconds",
+ "seconds",
+        "Number of seconds (>= 0 and less than 1 day) for each element.",
+ )
+ microseconds = _field_accessor(
+ "microseconds",
+ "microseconds",
+        "Number of microseconds (>= 0 and less than 1 second) for each element.",
+ )
+ nanoseconds = _field_accessor(
+ "nanoseconds",
+ "nanoseconds",
+        "Number of nanoseconds (>= 0 and less than 1 microsecond) for each element.",
+ )
@property
def components(self):
@@ -837,21 +887,31 @@ def components(self):
"""
from pandas import DataFrame
- columns = ['days', 'hours', 'minutes', 'seconds',
- 'milliseconds', 'microseconds', 'nanoseconds']
+ columns = [
+ "days",
+ "hours",
+ "minutes",
+ "seconds",
+ "milliseconds",
+ "microseconds",
+ "nanoseconds",
+ ]
hasnans = self._hasnans
if hasnans:
+
def f(x):
if isna(x):
return [np.nan] * len(columns)
return x.components
+
else:
+
def f(x):
return x.components
result = DataFrame([f(x) for x in self], columns=columns)
if not hasnans:
- result = result.astype('int64')
+ result = result.astype("int64")
return result
@@ -861,6 +921,7 @@ def f(x):
# ---------------------------------------------------------------------
# Constructor Helpers
+
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
"""
Parameters
@@ -894,7 +955,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
unit = parse_timedelta_unit(unit)
# Unwrap whatever we have into a np.ndarray
- if not hasattr(data, 'dtype'):
+ if not hasattr(data, "dtype"):
# e.g. list, tuple
if np.ndim(data) == 0:
# i.e. generator
@@ -926,7 +987,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
frac = data - base
if p:
frac = np.round(frac, p)
- data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]')
+ data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
data[mask] = iNaT
copy = False
@@ -939,21 +1000,27 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
elif is_datetime64_dtype(data):
# GH#23539
- warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
- "deprecated, will raise a TypeError in a future "
- "version",
- FutureWarning, stacklevel=4)
+ warnings.warn(
+ "Passing datetime64-dtype data to TimedeltaIndex is "
+ "deprecated, will raise a TypeError in a future "
+ "version",
+ FutureWarning,
+ stacklevel=4,
+ )
data = ensure_int64(data).view(_TD_DTYPE)
else:
- raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]"
- .format(dtype=data.dtype))
+ raise TypeError(
+ "dtype {dtype} cannot be converted to timedelta64[ns]".format(
+ dtype=data.dtype
+ )
+ )
data = np.array(data, copy=copy)
if data.ndim != 1:
raise ValueError("Only 1-dimensional input arrays are supported.")
- assert data.dtype == 'm8[ns]', data
+ assert data.dtype == "m8[ns]", data
return data, inferred_freq
@@ -1028,19 +1095,20 @@ def objects_to_td64ns(data, unit="ns", errors="raise"):
# coerce Index to np.ndarray, converting string-dtype if necessary
values = np.array(data, dtype=np.object_, copy=False)
- result = array_to_timedelta64(values,
- unit=unit, errors=errors)
- return result.view('timedelta64[ns]')
+ result = array_to_timedelta64(values, unit=unit, errors=errors)
+ return result.view("timedelta64[ns]")
def _validate_td64_dtype(dtype):
dtype = pandas_dtype(dtype)
if is_dtype_equal(dtype, np.dtype("timedelta64")):
dtype = _TD_DTYPE
- msg = textwrap.dedent("""\
+ msg = textwrap.dedent(
+ """\
Passing in 'timedelta' dtype with no precision is deprecated
and will raise in a future version. Please pass in
- 'timedelta64[ns]' instead.""")
+ 'timedelta64[ns]' instead."""
+ )
warnings.warn(msg, FutureWarning, stacklevel=4)
if not is_dtype_equal(dtype, _TD_DTYPE):
@@ -1062,8 +1130,9 @@ def _generate_regular_range(start, end, periods, offset):
e = Timedelta(end).value + stride
b = e - periods * stride
else:
- raise ValueError("at least 'start' or 'end' should be specified "
- "if a 'period' is given.")
+ raise ValueError(
+            "at least 'start' or 'end' should be specified if a 'period' is given."
+ )
data = np.arange(b, e, stride, dtype=np.int64)
return data
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 93db65deff820..15baf1bed0ecd 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -16,9 +16,17 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype,
- is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like,
- is_object_dtype, is_scalar, is_timedelta64_ns_dtype)
+ is_categorical_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64tz_dtype,
+ is_datetimelike,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_list_like,
+ is_object_dtype,
+ is_scalar,
+ is_timedelta64_ns_dtype,
+)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -28,14 +36,19 @@
import pandas.core.nanops as nanops
_shared_docs = dict()
-_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='',
- unique='IndexOpsMixin', duplicated='IndexOpsMixin')
+_indexops_doc_kwargs = dict(
+ klass="IndexOpsMixin",
+ inplace="",
+ unique="IndexOpsMixin",
+ duplicated="IndexOpsMixin",
+)
class StringMixin:
"""
Implements string methods so long as object defines a `__str__` method.
"""
+
# side note - this could be made into a metaclass if more than one
# object needs
@@ -75,7 +88,7 @@ def _reset_cache(self, key=None):
"""
Reset cached properties. If ``key`` is passed, only clears that key.
"""
- if getattr(self, '_cache', None) is None:
+ if getattr(self, "_cache", None) is None:
return
if key is None:
self._cache.clear()
@@ -87,7 +100,7 @@ def __sizeof__(self):
Generates the total memory usage for an object that returns
either a value or Series of values
"""
- if hasattr(self, 'memory_usage'):
+ if hasattr(self, "memory_usage"):
mem = self.memory_usage(deep=True)
if not is_scalar(mem):
mem = mem.sum()
@@ -120,12 +133,14 @@ def __setattr__(self, key, value):
# because
# 1.) getattr is false for attributes that raise errors
# 2.) cls.__dict__ doesn't traverse into base classes
- if (getattr(self, "__frozen", False) and not
- (key == "_cache" or
- key in type(self).__dict__ or
- getattr(self, key, None) is not None)):
- raise AttributeError("You cannot add any new attribute '{key}'".
- format(key=key))
+ if getattr(self, "__frozen", False) and not (
+ key == "_cache"
+ or key in type(self).__dict__
+ or getattr(self, key, None) is not None
+ ):
+ raise AttributeError(
+ "You cannot add any new attribute '{key}'".format(key=key)
+ )
object.__setattr__(self, key, value)
@@ -146,43 +161,44 @@ class SelectionMixin:
mixin implementing the selection & aggregation interface on a group-like
object sub-classes need to define: obj, exclusions
"""
+
_selection = None
- _internal_names = ['_cache', '__setstate__']
+ _internal_names = ["_cache", "__setstate__"]
_internal_names_set = set(_internal_names)
- _builtin_table = OrderedDict((
- (builtins.sum, np.sum),
- (builtins.max, np.max),
- (builtins.min, np.min),
- ))
-
- _cython_table = OrderedDict((
- (builtins.sum, 'sum'),
- (builtins.max, 'max'),
- (builtins.min, 'min'),
- (np.all, 'all'),
- (np.any, 'any'),
- (np.sum, 'sum'),
- (np.nansum, 'sum'),
- (np.mean, 'mean'),
- (np.nanmean, 'mean'),
- (np.prod, 'prod'),
- (np.nanprod, 'prod'),
- (np.std, 'std'),
- (np.nanstd, 'std'),
- (np.var, 'var'),
- (np.nanvar, 'var'),
- (np.median, 'median'),
- (np.nanmedian, 'median'),
- (np.max, 'max'),
- (np.nanmax, 'max'),
- (np.min, 'min'),
- (np.nanmin, 'min'),
- (np.cumprod, 'cumprod'),
- (np.nancumprod, 'cumprod'),
- (np.cumsum, 'cumsum'),
- (np.nancumsum, 'cumsum'),
- ))
+ _builtin_table = OrderedDict(
+ ((builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min))
+ )
+
+ _cython_table = OrderedDict(
+ (
+ (builtins.sum, "sum"),
+ (builtins.max, "max"),
+ (builtins.min, "min"),
+ (np.all, "all"),
+ (np.any, "any"),
+ (np.sum, "sum"),
+ (np.nansum, "sum"),
+ (np.mean, "mean"),
+ (np.nanmean, "mean"),
+ (np.prod, "prod"),
+ (np.nanprod, "prod"),
+ (np.std, "std"),
+ (np.nanstd, "std"),
+ (np.var, "var"),
+ (np.nanvar, "var"),
+ (np.median, "median"),
+ (np.nanmedian, "median"),
+ (np.max, "max"),
+ (np.nanmax, "max"),
+ (np.min, "min"),
+ (np.nanmin, "min"),
+ (np.cumprod, "cumprod"),
+ (np.nancumprod, "cumprod"),
+ (np.cumsum, "cumsum"),
+ (np.nancumsum, "cumsum"),
+ )
+ )
@property
def _selection_name(self):
@@ -198,8 +214,9 @@ def _selection_name(self):
@property
def _selection_list(self):
- if not isinstance(self._selection, (list, tuple, ABCSeries,
- ABCIndexClass, np.ndarray)):
+ if not isinstance(
+ self._selection, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)
+ ):
return [self._selection]
return self._selection
@@ -217,8 +234,7 @@ def ndim(self):
@cache_readonly
def _obj_with_exclusions(self):
- if self._selection is not None and isinstance(self.obj,
- ABCDataFrame):
+ if self._selection is not None and isinstance(self.obj, ABCDataFrame):
return self.obj.reindex(columns=self._selection_list)
if len(self.exclusions) > 0:
@@ -228,18 +244,21 @@ def _obj_with_exclusions(self):
def __getitem__(self, key):
if self._selection is not None:
- raise IndexError('Column(s) {selection} already selected'
- .format(selection=self._selection))
+ raise IndexError(
+ "Column(s) {selection} already selected".format(
+ selection=self._selection
+ )
+ )
- if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass,
- np.ndarray)):
+ if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)):
if len(self.obj.columns.intersection(key)) != len(key):
bad_keys = list(set(key).difference(self.obj.columns))
- raise KeyError("Columns not found: {missing}"
- .format(missing=str(bad_keys)[1:-1]))
+ raise KeyError(
+ "Columns not found: {missing}".format(missing=str(bad_keys)[1:-1])
+ )
return self._gotitem(list(key), ndim=2)
- elif not getattr(self, 'as_index', False):
+ elif not getattr(self, "as_index", False):
if key not in self.obj.columns:
raise KeyError("Column not found: {key}".format(key=key))
return self._gotitem(key, ndim=2)
@@ -288,8 +307,9 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs):
# people may try to aggregate on a non-callable attribute
# but don't let them think they can pass args to it
assert len(args) == 0
- assert len([kwarg for kwarg in kwargs
- if kwarg not in ['axis', '_level']]) == 0
+ assert (
+ len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0
+ )
return f
f = getattr(np, arg, None)
@@ -320,34 +340,35 @@ def _aggregate(self, arg, *args, **kwargs):
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
is_nested_renamer = False
- _axis = kwargs.pop('_axis', None)
+ _axis = kwargs.pop("_axis", None)
if _axis is None:
- _axis = getattr(self, 'axis', 0)
- _level = kwargs.pop('_level', None)
+ _axis = getattr(self, "axis", 0)
+ _level = kwargs.pop("_level", None)
if isinstance(arg, str):
- return self._try_aggregate_string_function(arg, *args,
- **kwargs), None
+ return self._try_aggregate_string_function(arg, *args, **kwargs), None
if isinstance(arg, dict):
# aggregate based on the passed dict
if _axis != 0: # pragma: no cover
- raise ValueError('Can only pass dict with axis=0')
+ raise ValueError("Can only pass dict with axis=0")
obj = self._selected_obj
def nested_renaming_depr(level=4):
# deprecation of nested renaming
# GH 15931
- msg = textwrap.dedent("""\
+ msg = textwrap.dedent(
+ """\
using a dict with renaming is deprecated and will be removed
in a future version.
For column-specific groupby renaming, use named aggregation
>>> df.groupby(...).agg(name=('column', aggfunc))
- """)
+ """
+ )
warnings.warn(msg, FutureWarning, stacklevel=level)
# if we have a dict of any non-scalars
@@ -375,17 +396,17 @@ def nested_renaming_depr(level=4):
is_nested_renamer = True
if k not in obj.columns:
- msg = ('cannot perform renaming for {key} with a '
- 'nested dictionary').format(key=k)
+ msg = (
+ "cannot perform renaming for {key} with a "
+ "nested dictionary"
+ ).format(key=k)
raise SpecificationError(msg)
nested_renaming_depr(4 + (_level or 0))
elif isinstance(obj, ABCSeries):
nested_renaming_depr()
- elif (isinstance(obj, ABCDataFrame) and
- k not in obj.columns):
- raise KeyError(
- "Column '{col}' does not exist!".format(col=k))
+ elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
+ raise KeyError("Column '{col}' does not exist!".format(col=k))
arg = new_arg
@@ -393,8 +414,9 @@ def nested_renaming_depr(level=4):
# deprecation of renaming keys
# GH 15931
keys = list(arg.keys())
- if (isinstance(obj, ABCDataFrame) and
- len(obj.columns.intersection(keys)) != len(keys)):
+ if isinstance(obj, ABCDataFrame) and len(
+ obj.columns.intersection(keys)
+ ) != len(keys):
nested_renaming_depr()
from pandas.core.reshape.concat import concat
@@ -405,16 +427,16 @@ def _agg_1dim(name, how, subset=None):
"""
colg = self._gotitem(name, ndim=1, subset=subset)
if colg.ndim != 1:
- raise SpecificationError("nested dictionary is ambiguous "
- "in aggregation")
+ raise SpecificationError(
+                    "nested dictionary is ambiguous in aggregation"
+ )
return colg.aggregate(how, _level=(_level or 0) + 1)
def _agg_2dim(name, how):
"""
aggregate a 2-dim with how
"""
- colg = self._gotitem(self._selection, ndim=2,
- subset=obj)
+ colg = self._gotitem(self._selection, ndim=2, subset=obj)
return colg.aggregate(how, _level=None)
def _agg(arg, func):
@@ -456,8 +478,9 @@ def _agg(arg, func):
# but may have multiple aggregations
if len(sl) == 1:
- result = _agg(arg, lambda fname,
- agg_how: _agg_1dim(self._selection, agg_how))
+ result = _agg(
+ arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how)
+ )
# we are selecting the same set as we are aggregating
elif not len(sl - set(keys)):
@@ -488,8 +511,7 @@ def is_any_series():
def is_any_frame():
# return a boolean if we have *any* nested series
- return any(isinstance(r, ABCDataFrame)
- for r in result.values())
+ return any(isinstance(r, ABCDataFrame) for r in result.values())
if isinstance(result, list):
return concat(result, keys=keys, axis=1, sort=True), True
@@ -498,8 +520,7 @@ def is_any_frame():
# we have a dict of DataFrames
# return a MI DataFrame
- return concat([result[k] for k in keys],
- keys=keys, axis=1), True
+ return concat([result[k] for k in keys], keys=keys, axis=1), True
elif isinstance(self, ABCSeries) and is_any_series():
@@ -512,28 +533,28 @@ def is_any_frame():
# we have non-same sized objects, so
# we don't automatically broadcast
- raise ValueError("cannot perform both aggregation "
- "and transformation operations "
- "simultaneously")
+ raise ValueError(
+ "cannot perform both aggregation "
+ "and transformation operations "
+ "simultaneously"
+ )
return result, True
# fall thru
from pandas import DataFrame, Series
+
try:
result = DataFrame(result)
except ValueError:
# we have a dict of scalars
- result = Series(result,
- name=getattr(self, 'name', None))
+ result = Series(result, name=getattr(self, "name", None))
return result, True
elif is_list_like(arg):
# we require a list, but not an 'str'
- return self._aggregate_multiple_funcs(arg,
- _level=_level,
- _axis=_axis), None
+ return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None
else:
result = None
@@ -577,8 +598,7 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):
else:
for index, col in enumerate(obj):
try:
- colg = self._gotitem(col, ndim=1,
- subset=obj.iloc[:, index])
+ colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index])
results.append(colg.aggregate(arg))
keys.append(col)
except (TypeError, DataError):
@@ -602,10 +622,12 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):
from pandas.core.dtypes.cast import is_nested_object
from pandas import Series
+
result = Series(results, index=keys, name=self.name)
if is_nested_object(result):
- raise ValueError("cannot combine transform and "
- "aggregation operations")
+ raise ValueError(
+                "cannot combine transform and aggregation operations"
+ )
return result
def _shallow_copy(self, obj=None, obj_type=None, **kwargs):
@@ -656,8 +678,11 @@ def transpose(self, *args, **kwargs):
nv.validate_transpose(args, kwargs)
return self
- T = property(transpose, doc="""\nReturn the transpose, which is by
- definition self.\n""")
+ T = property(
+ transpose,
+ doc="""\nReturn the transpose, which is by
+ definition self.\n""",
+ )
@property
def _is_homogeneous_type(self):
@@ -700,8 +725,11 @@ def item(self):
scalar
The first element of %(klass)s.
"""
- warnings.warn('`item` has been deprecated and will be removed in a '
- 'future version', FutureWarning, stacklevel=2)
+ warnings.warn(
+            "`item` has been deprecated and will be removed in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
return self.values.item()
@property
@@ -711,9 +739,12 @@ def data(self):
.. deprecated:: 0.23.0
"""
- warnings.warn("{obj}.data is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.data is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return self.values.data
@property
@@ -723,9 +754,12 @@ def itemsize(self):
.. deprecated:: 0.23.0
"""
- warnings.warn("{obj}.itemsize is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.itemsize is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return self._ndarray_values.itemsize
@property
@@ -742,9 +776,12 @@ def strides(self):
.. deprecated:: 0.23.0
"""
- warnings.warn("{obj}.strides is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.strides is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return self._ndarray_values.strides
@property
@@ -761,9 +798,12 @@ def flags(self):
.. deprecated:: 0.23.0
"""
- warnings.warn("{obj}.flags is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.flags is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return self.values.flags
@property
@@ -773,9 +813,12 @@ def base(self):
.. deprecated:: 0.23.0
"""
- warnings.warn("{obj}.base is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.base is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return self.values.base
@property
@@ -849,13 +892,16 @@ def array(self) -> ExtensionArray:
if is_datetime64_ns_dtype(result.dtype):
from pandas.arrays import DatetimeArray
+
result = DatetimeArray(result)
elif is_timedelta64_ns_dtype(result.dtype):
from pandas.arrays import TimedeltaArray
+
result = TimedeltaArray(result)
elif not is_extension_array_dtype(result.dtype):
from pandas.core.arrays.numpy_ import PandasArray
+
result = PandasArray(result)
return result
@@ -1156,13 +1202,17 @@ def hasnans(self):
"""
return bool(isna(self).any())
- def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
- filter_type=None, **kwds):
+ def _reduce(
+ self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
+ ):
""" perform the reduction type operation if we can """
func = getattr(self, name, None)
if func is None:
- raise TypeError("{klass} cannot perform the operation {op}".format(
- klass=self.__class__.__name__, op=name))
+ raise TypeError(
+ "{klass} cannot perform the operation {op}".format(
+ klass=self.__class__.__name__, op=name
+ )
+ )
return func(skipna=skipna, **kwds)
def _map_values(self, mapper, na_action=None):
@@ -1191,7 +1241,7 @@ def _map_values(self, mapper, na_action=None):
# as we know that we are not going to have to yield
# python types
if isinstance(mapper, dict):
- if hasattr(mapper, '__missing__'):
+ if hasattr(mapper, "__missing__"):
# If a dictionary subclass defines a default value method,
# convert mapper to a lookup function (GH #15999).
dict_with_default = mapper
@@ -1202,6 +1252,7 @@ def _map_values(self, mapper, na_action=None):
# we specify the keys here to handle the
# possibility that they are tuples
from pandas import Series
+
mapper = Series(mapper)
if isinstance(mapper, ABCSeries):
@@ -1229,11 +1280,12 @@ def _map_values(self, mapper, na_action=None):
map_f = lambda values, f: values.map(f)
else:
values = self.astype(object)
- values = getattr(values, 'values', values)
- if na_action == 'ignore':
+ values = getattr(values, "values", values)
+ if na_action == "ignore":
+
def map_f(values, f):
- return lib.map_infer_mask(values, f,
- isna(values).view(np.uint8))
+ return lib.map_infer_mask(values, f, isna(values).view(np.uint8))
+
else:
map_f = lib.map_infer
@@ -1242,8 +1294,9 @@ def map_f(values, f):
return new_values
- def value_counts(self, normalize=False, sort=True, ascending=False,
- bins=None, dropna=True):
+ def value_counts(
+ self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
+ ):
"""
Return a Series containing counts of unique values.
@@ -1322,18 +1375,26 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
dtype: int64
"""
from pandas.core.algorithms import value_counts
- result = value_counts(self, sort=sort, ascending=ascending,
- normalize=normalize, bins=bins, dropna=dropna)
+
+ result = value_counts(
+ self,
+ sort=sort,
+ ascending=ascending,
+ normalize=normalize,
+ bins=bins,
+ dropna=dropna,
+ )
return result
def unique(self):
values = self._values
- if hasattr(values, 'unique'):
+ if hasattr(values, "unique"):
result = values.unique()
else:
from pandas.core.algorithms import unique1d
+
result = unique1d(values)
return result
@@ -1402,6 +1463,7 @@ def is_monotonic(self):
bool
"""
from pandas import Index
+
return Index(self).is_monotonic
is_monotonic_increasing = is_monotonic
@@ -1419,6 +1481,7 @@ def is_monotonic_decreasing(self):
bool
"""
from pandas import Index
+
return Index(self).is_monotonic_decreasing
def memory_usage(self, deep=False):
@@ -1444,7 +1507,7 @@ def memory_usage(self, deep=False):
Memory usage does not include memory consumed by elements that
are not components of the array if deep=False or if used on PyPy
"""
- if hasattr(self.array, 'memory_usage'):
+ if hasattr(self.array, "memory_usage"):
return self.array.memory_usage(deep=deep)
v = self.array.nbytes
@@ -1453,18 +1516,24 @@ def memory_usage(self, deep=False):
return v
@Substitution(
- values='', order='', size_hint='',
- sort=textwrap.dedent("""\
+ values="",
+ order="",
+ size_hint="",
+ sort=textwrap.dedent(
+ """\
sort : boolean, default False
Sort `uniques` and shuffle `labels` to maintain the
relationship.
- """))
- @Appender(algorithms._shared_docs['factorize'])
+ """
+ ),
+ )
+ @Appender(algorithms._shared_docs["factorize"])
def factorize(self, sort=False, na_sentinel=-1):
return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)
- _shared_docs['searchsorted'] = (
- """
+ _shared_docs[
+ "searchsorted"
+ ] = """
Find indices where elements should be inserted to maintain order.
Find the indices into a sorted %(klass)s `self` such that, if the
@@ -1534,16 +1603,15 @@ def factorize(self, sort=False, na_sentinel=-1):
>>> x.searchsorted(['bread'], side='right')
array([3])
- """)
+ """
- @Substitution(klass='Index')
- @Appender(_shared_docs['searchsorted'])
- def searchsorted(self, value, side='left', sorter=None):
- return algorithms.searchsorted(self._values, value,
- side=side, sorter=sorter)
+ @Substitution(klass="Index")
+ @Appender(_shared_docs["searchsorted"])
+ def searchsorted(self, value, side="left", sorter=None):
+ return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)
- def drop_duplicates(self, keep='first', inplace=False):
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ def drop_duplicates(self, keep="first", inplace=False):
+ inplace = validate_bool_kwarg(inplace, "inplace")
if isinstance(self, ABCIndexClass):
if self.is_unique:
return self._shallow_copy()
@@ -1555,15 +1623,17 @@ def drop_duplicates(self, keep='first', inplace=False):
else:
return result
- def duplicated(self, keep='first'):
+ def duplicated(self, keep="first"):
from pandas.core.algorithms import duplicated
+
if isinstance(self, ABCIndexClass):
if self.is_unique:
return np.zeros(len(self), dtype=np.bool)
return duplicated(self, keep=keep)
else:
- return self._constructor(duplicated(self, keep=keep),
- index=self.index).__finalize__(self)
+ return self._constructor(
+ duplicated(self, keep=keep), index=self.index
+ ).__finalize__(self)
# ----------------------------------------------------------------------
# abstracts
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 771ded04f461d..d2dd0d03d9425 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -18,7 +18,11 @@
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
- is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer)
+ is_array_like,
+ is_bool_dtype,
+ is_extension_array_dtype,
+ is_integer,
+)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import _iterable_not_string
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
@@ -114,9 +118,10 @@ def is_bool_indexer(key: Any) -> bool:
When the array is an object-dtype ndarray or ExtensionArray
and contains missing values.
"""
- na_msg = 'cannot index with vector containing NA / NaN values'
- if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or
- (is_array_like(key) and is_extension_array_dtype(key.dtype))):
+ na_msg = "cannot index with vector containing NA / NaN values"
+ if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
+ is_array_like(key) and is_extension_array_dtype(key.dtype)
+ ):
if key.dtype == np.object_:
key = np.asarray(values_from_object(key))
@@ -234,7 +239,7 @@ def dict_keys_to_ordered_list(mapping):
def asarray_tuplesafe(values, dtype=None):
- if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')):
+ if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):
values = list(values)
elif isinstance(values, ABCIndexClass):
return values.values
@@ -302,8 +307,12 @@ def is_null_slice(obj):
"""
We have a null slice.
"""
- return (isinstance(obj, slice) and obj.start is None and
- obj.stop is None and obj.step is None)
+ return (
+ isinstance(obj, slice)
+ and obj.start is None
+ and obj.stop is None
+ and obj.step is None
+ )
def is_true_slices(l):
@@ -318,19 +327,20 @@ def is_full_slice(obj, l):
"""
We have a full length slice.
"""
- return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and
- obj.step is None)
+ return (
+ isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None
+ )
def get_callable_name(obj):
# typical case has name
- if hasattr(obj, '__name__'):
- return getattr(obj, '__name__')
+ if hasattr(obj, "__name__"):
+ return getattr(obj, "__name__")
# some objects don't; could recurse
if isinstance(obj, partial):
return get_callable_name(obj.func)
# fall back to class name
- if hasattr(obj, '__call__'):
+ if hasattr(obj, "__call__"):
return obj.__class__.__name__
# everything failed (probably because the argument
# wasn't actually callable); we return None
@@ -399,14 +409,12 @@ def standardize_mapping(into):
"""
if not inspect.isclass(into):
if isinstance(into, collections.defaultdict):
- return partial(
- collections.defaultdict, into.default_factory)
+ return partial(collections.defaultdict, into.default_factory)
into = type(into)
if not issubclass(into, abc.Mapping):
- raise TypeError('unsupported type: {into}'.format(into=into))
+ raise TypeError("unsupported type: {into}".format(into=into))
elif into == collections.defaultdict:
- raise TypeError(
- 'to_dict() only accepts initialized defaultdicts')
+ raise TypeError("to_dict() only accepts initialized defaultdicts")
return into
@@ -435,8 +443,9 @@ def random_state(state=None):
elif state is None:
return np.random
else:
- raise ValueError("random_state must be an integer, a numpy "
- "RandomState, or None")
+ raise ValueError(
+ "random_state must be an integer, a numpy " "RandomState, or None"
+ )
def _pipe(obj, func, *args, **kwargs):
@@ -466,7 +475,7 @@ def _pipe(obj, func, *args, **kwargs):
if isinstance(func, tuple):
func, target = func
if target in kwargs:
- msg = '%s is both the pipe target and a keyword argument' % target
+ msg = "%s is both the pipe target and a keyword argument" % target
raise ValueError(msg)
kwargs[target] = obj
return func(*args, **kwargs)
@@ -486,6 +495,7 @@ def f(x):
return mapper[x]
else:
return x
+
else:
f = mapper
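The helpers above are shared plumbing for user-facing keywords; a small sketch of the `random_state` dispatch mirrored from the branches shown (the wrapper name is hypothetical):

import numpy as np

def as_random_state(state=None):
    # mirrors pandas.core.common.random_state: seed -> RandomState,
    # RandomState -> itself, None -> the global np.random module
    if isinstance(state, (int, np.integer)):
        return np.random.RandomState(state)
    if isinstance(state, np.random.RandomState):
        return state
    if state is None:
        return np.random
    raise ValueError(
        "random_state must be an integer, a numpy RandomState, or None"
    )

as_random_state(42).permutation(5)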
diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py
index a7524161dd80e..1046401850963 100644
--- a/pandas/core/computation/align.py
+++ b/pandas/core/computation/align.py
@@ -18,25 +18,23 @@ def _align_core_single_unary_op(term):
typ = partial(np.asanyarray, dtype=term.value.dtype)
else:
typ = type(term.value)
- ret = typ,
+ ret = (typ,)
- if not hasattr(term.value, 'axes'):
- ret += None,
+ if not hasattr(term.value, "axes"):
+ ret += (None,)
else:
- ret += _zip_axes_from_type(typ, term.value.axes),
+ ret += (_zip_axes_from_type(typ, term.value.axes),)
return ret
def _zip_axes_from_type(typ, new_axes):
- axes = {ax_name: new_axes[ax_ind]
- for ax_ind, ax_name in typ._AXIS_NAMES.items()}
+ axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()}
return axes
def _any_pandas_objects(terms):
"""Check a sequence of terms for instances of PandasObject."""
- return any(isinstance(term.value, pd.core.generic.PandasObject)
- for term in terms)
+ return any(isinstance(term.value, pd.core.generic.PandasObject) for term in terms)
def _filter_special_cases(f):
@@ -53,13 +51,13 @@ def wrapper(terms):
return _result_type_many(*term_values), None
return f(terms)
+
return wrapper
@_filter_special_cases
def _align_core(terms):
- term_index = [i for i, term in enumerate(terms)
- if hasattr(term.value, 'axes')]
+ term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
term_dims = [terms[i].value.ndim for i in term_index]
ndims = pd.Series(dict(zip(term_index, term_dims)))
@@ -81,13 +79,13 @@ def _align_core(terms):
ax, itm = axis, items
if not axes[ax].is_(itm):
- axes[ax] = axes[ax].join(itm, how='outer')
+ axes[ax] = axes[ax].join(itm, how="outer")
for i, ndim in ndims.items():
for axis, items in zip(range(ndim), axes):
ti = terms[i].value
- if hasattr(ti, 'reindex'):
+ if hasattr(ti, "reindex"):
transpose = isinstance(ti, pd.Series) and naxes > 1
reindexer = axes[naxes - 1] if transpose else items
@@ -96,10 +94,11 @@ def _align_core(terms):
ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
if ordm >= 1 and reindexer_size >= 10000:
- w = ('Alignment difference on axis {axis} is larger '
- 'than an order of magnitude on term {term!r}, by '
- 'more than {ordm:.4g}; performance may suffer'
- ).format(axis=axis, term=terms[i].name, ordm=ordm)
+ w = (
+ "Alignment difference on axis {axis} is larger "
+ "than an order of magnitude on term {term!r}, by "
+ "more than {ordm:.4g}; performance may suffer"
+ ).format(axis=axis, term=terms[i].name, ordm=ordm)
warnings.warn(w, category=PerformanceWarning, stacklevel=6)
f = partial(ti.reindex, reindexer, axis=axis, copy=False)
@@ -158,12 +157,11 @@ def _reconstruct_object(typ, obj, axes, dtype):
res_t = np.result_type(obj.dtype, dtype)
- if (not isinstance(typ, partial) and
- issubclass(typ, pd.core.generic.PandasObject)):
+ if not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject):
return typ(obj, dtype=res_t, **axes)
# special case for pathological things like ~True/~False
- if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_:
+ if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
ret_value = res_t.type(obj)
else:
ret_value = typ(obj).astype(res_t)
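A compact restatement of the alignment-size heuristic reformatted above, which decides when `_align_core` emits a PerformanceWarning (the helper name is illustrative):

import numpy as np

def alignment_would_warn(reindexer_size, term_axis_size):
    # warn when the axis sizes differ by an order of magnitude or more
    # and the reindexing target has at least 10000 elements
    ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
    return ordm >= 1 and reindexer_size >= 10000

alignment_would_warn(100000, 10)    # True  -> PerformanceWarning, stacklevel=6
alignment_would_warn(5000, 4990)    # False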
diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py
index fc6b9a2522824..4d205909b9e2e 100644
--- a/pandas/core/computation/check.py
+++ b/pandas/core/computation/check.py
@@ -1,11 +1,10 @@
from pandas.compat._optional import import_optional_dependency
-ne = import_optional_dependency("numexpr", raise_on_missing=False,
- on_version="warn")
+ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn")
_NUMEXPR_INSTALLED = ne is not None
if _NUMEXPR_INSTALLED:
_NUMEXPR_VERSION = ne.__version__
else:
_NUMEXPR_VERSION = None
-__all__ = ['_NUMEXPR_INSTALLED', '_NUMEXPR_VERSION']
+__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"]
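These flags are what the rest of `pandas.core.computation` consults; for example (the same import appears in the ops.py hunk further below):

from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION

# eval falls back to engine="python" when this is False
print(_NUMEXPR_INSTALLED, _NUMEXPR_VERSION)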
diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py
index 6a0e7981ad82b..ddb1023479cba 100644
--- a/pandas/core/computation/common.py
+++ b/pandas/core/computation/common.py
@@ -11,7 +11,7 @@
def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
if isinstance(s, (np.bytes_, bytes)):
- s = s.decode(pd.get_option('display.encoding'))
+ s = s.decode(pd.get_option("display.encoding"))
return s
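`_ensure_decoded` only matters for byte strings coming back from PyTables; the encoding it uses is the ordinary option, typically a UTF-8 console encoding:

import pandas as pd

pd.get_option("display.encoding")   # commonly "utf-8", depending on the console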
diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py
index c75552d15441d..2c94b142a45b3 100644
--- a/pandas/core/computation/engines.py
+++ b/pandas/core/computation/engines.py
@@ -5,8 +5,7 @@
import abc
from pandas.core.computation.align import _align, _reconstruct_object
-from pandas.core.computation.ops import (
- UndefinedVariableError, _mathops, _reductions)
+from pandas.core.computation.ops import UndefinedVariableError, _mathops, _reductions
import pandas.io.formats.printing as printing
@@ -29,10 +28,11 @@ def _check_ne_builtin_clash(expr):
overlap = names & _ne_builtins
if overlap:
- s = ', '.join(map(repr, overlap))
- raise NumExprClobberingError('Variables in expression "{expr}" '
- 'overlap with builtins: ({s})'
- .format(expr=expr, s=s))
+ s = ", ".join(map(repr, overlap))
+ raise NumExprClobberingError(
+ 'Variables in expression "{expr}" '
+ "overlap with builtins: ({s})".format(expr=expr, s=s)
+ )
class AbstractEngine(metaclass=abc.ABCMeta):
@@ -68,8 +68,9 @@ def evaluate(self):
# make sure no names in resolvers and locals/globals clash
res = self._evaluate()
- return _reconstruct_object(self.result_type, res, self.aligned_axes,
- self.expr.terms.return_type)
+ return _reconstruct_object(
+ self.result_type, res, self.aligned_axes, self.expr.terms.return_type
+ )
@property
def _is_aligned(self):
@@ -95,6 +96,7 @@ def _evaluate(self):
class NumExprEngine(AbstractEngine):
"""NumExpr engine class"""
+
has_neg_frac = True
def __init__(self, expr):
@@ -112,7 +114,7 @@ def _evaluate(self):
try:
env = self.expr.env
scope = env.full_scope
- truediv = scope['truediv']
+ truediv = scope["truediv"]
_check_ne_builtin_clash(self.expr)
return ne.evaluate(s, local_dict=scope, truediv=truediv)
except KeyError as e:
@@ -130,6 +132,7 @@ class PythonEngine(AbstractEngine):
Mostly for testing purposes.
"""
+
has_neg_frac = False
def __init__(self, expr):
@@ -142,4 +145,4 @@ def _evaluate(self):
pass
-_engines = {'numexpr': NumExprEngine, 'python': PythonEngine}
+_engines = {"numexpr": NumExprEngine, "python": PythonEngine}
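The clash check above is observable from query/eval when a column name shadows a numexpr builtin; a hedged illustration (the column name is hypothetical):

import pandas as pd

df = pd.DataFrame({"sin": [1.0, 2.0, 3.0]})

# with the default numexpr engine this overlaps _ne_builtins and raises
# NumExprClobberingError; the python engine evaluates it normally
df.query("sin > 1", engine="python")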
diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
index ef4639a3afe4c..456ecf4b2594f 100644
--- a/pandas/core/computation/eval.py
+++ b/pandas/core/computation/eval.py
@@ -38,24 +38,28 @@ def _check_engine(engine):
if engine is None:
if _NUMEXPR_INSTALLED:
- engine = 'numexpr'
+ engine = "numexpr"
else:
- engine = 'python'
+ engine = "python"
if engine not in _engines:
valid = list(_engines.keys())
- raise KeyError('Invalid engine {engine!r} passed, valid engines are'
- ' {valid}'.format(engine=engine, valid=valid))
+ raise KeyError(
+ "Invalid engine {engine!r} passed, valid engines are"
+ " {valid}".format(engine=engine, valid=valid)
+ )
# TODO: validate this in a more general way (thinking of future engines
# that won't necessarily be import-able)
# Could potentially be done on engine instantiation
- if engine == 'numexpr':
+ if engine == "numexpr":
if not _NUMEXPR_INSTALLED:
- raise ImportError("'numexpr' is not installed or an "
- "unsupported version. Cannot use "
- "engine='numexpr' for query/eval "
- "if 'numexpr' is not installed")
+ raise ImportError(
+ "'numexpr' is not installed or an "
+ "unsupported version. Cannot use "
+ "engine='numexpr' for query/eval "
+ "if 'numexpr' is not installed"
+ )
return engine
@@ -76,17 +80,21 @@ def _check_parser(parser):
from pandas.core.computation.expr import _parsers
if parser not in _parsers:
- raise KeyError('Invalid parser {parser!r} passed, valid parsers are'
- ' {valid}'.format(parser=parser, valid=_parsers.keys()))
+ raise KeyError(
+ "Invalid parser {parser!r} passed, valid parsers are"
+ " {valid}".format(parser=parser, valid=_parsers.keys())
+ )
def _check_resolvers(resolvers):
if resolvers is not None:
for resolver in resolvers:
- if not hasattr(resolver, '__getitem__'):
+ if not hasattr(resolver, "__getitem__"):
name = type(resolver).__name__
- raise TypeError('Resolver of type {name!r} does not implement '
- 'the __getitem__ method'.format(name=name))
+ raise TypeError(
+ "Resolver of type {name!r} does not implement "
+ "the __getitem__ method".format(name=name)
+ )
def _check_expression(expr):
@@ -140,25 +148,36 @@ def _check_for_locals(expr, stack_level, parser):
from pandas.core.computation.expr import tokenize_string
at_top_of_stack = stack_level == 0
- not_pandas_parser = parser != 'pandas'
+ not_pandas_parser = parser != "pandas"
if not_pandas_parser:
msg = "The '@' prefix is only supported by the pandas parser"
elif at_top_of_stack:
- msg = ("The '@' prefix is not allowed in "
- "top-level eval calls, \nplease refer to "
- "your variables by name without the '@' "
- "prefix")
+ msg = (
+ "The '@' prefix is not allowed in "
+ "top-level eval calls, \nplease refer to "
+ "your variables by name without the '@' "
+ "prefix"
+ )
if at_top_of_stack or not_pandas_parser:
for toknum, tokval in tokenize_string(expr):
- if toknum == tokenize.OP and tokval == '@':
+ if toknum == tokenize.OP and tokval == "@":
raise SyntaxError(msg)
-def eval(expr, parser='pandas', engine=None, truediv=True,
- local_dict=None, global_dict=None, resolvers=(), level=0,
- target=None, inplace=False):
+def eval(
+ expr,
+ parser="pandas",
+ engine=None,
+ truediv=True,
+ local_dict=None,
+ global_dict=None,
+ resolvers=(),
+ level=0,
+ target=None,
+ inplace=False,
+):
"""
Evaluate a Python expression as a string using various backends.
@@ -269,14 +288,16 @@ def eval(expr, parser='pandas', engine=None, truediv=True,
if isinstance(expr, str):
_check_expression(expr)
- exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']
+ exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
else:
exprs = [expr]
multi_line = len(exprs) > 1
if multi_line and target is None:
- raise ValueError("multi-line expressions are only valid in the "
- "context of data, use DataFrame.eval")
+ raise ValueError(
+ "multi-line expressions are only valid in the "
+ "context of data, use DataFrame.eval"
+ )
ret = None
first_expr = True
@@ -290,12 +311,15 @@ def eval(expr, parser='pandas', engine=None, truediv=True,
_check_for_locals(expr, level, parser)
# get our (possibly passed-in) scope
- env = _ensure_scope(level + 1, global_dict=global_dict,
- local_dict=local_dict, resolvers=resolvers,
- target=target)
+ env = _ensure_scope(
+ level + 1,
+ global_dict=global_dict,
+ local_dict=local_dict,
+ resolvers=resolvers,
+ target=target,
+ )
- parsed_expr = Expr(expr, engine=engine, parser=parser, env=env,
- truediv=truediv)
+ parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv)
# construct the engine and evaluate the parsed expression
eng = _engines[engine]
@@ -304,11 +328,12 @@ def eval(expr, parser='pandas', engine=None, truediv=True,
if parsed_expr.assigner is None:
if multi_line:
- raise ValueError("Multi-line expressions are only valid"
- " if all expressions contain an assignment")
+ raise ValueError(
+ "Multi-line expressions are only valid"
+ " if all expressions contain an assignment"
+ )
elif inplace:
- raise ValueError("Cannot operate inplace "
- "if there is no assignment")
+ raise ValueError("Cannot operate inplace " "if there is no assignment")
# assign if needed
assigner = parsed_expr.assigner
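To make the reformatted checks above concrete, a short usage sketch of the public entry points (frame contents are illustrative):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# a single expression needs no target
pd.eval("df.a + df.b")

# multi-line expressions must all assign and need a target,
# which DataFrame.eval supplies
df.eval(
    """
    c = a + b
    d = c * 2
    """,
    inplace=True,
)

# engine/parser are validated against _engines / _parsers
pd.eval("df.a + df.b", engine="python", parser="pandas")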
diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py
index 32bd34c4db7d7..772fb547567e3 100644
--- a/pandas/core/computation/expr.py
+++ b/pandas/core/computation/expr.py
@@ -15,11 +15,27 @@
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
- _BACKTICK_QUOTED_STRING, _remove_spaces_column_name)
+ _BACKTICK_QUOTED_STRING,
+ _remove_spaces_column_name,
+)
from pandas.core.computation.ops import (
- _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
- UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
- _mathops, _reductions, _unary_ops_syms, is_term)
+ _LOCAL_TAG,
+ BinOp,
+ Constant,
+ Div,
+ FuncNode,
+ Op,
+ Term,
+ UnaryOp,
+ UndefinedVariableError,
+ _arith_ops_syms,
+ _bool_ops_syms,
+ _cmp_ops_syms,
+ _mathops,
+ _reductions,
+ _unary_ops_syms,
+ is_term,
+)
from pandas.core.computation.scope import Scope
import pandas.io.formats.printing as printing
@@ -40,10 +56,13 @@ def tokenize_string(source):
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
- if tokval == '`':
- tokval = " ".join(it.takewhile(
- lambda tokval: tokval != '`',
- map(operator.itemgetter(1), token_generator)))
+ if tokval == "`":
+ tokval = " ".join(
+ it.takewhile(
+ lambda tokval: tokval != "`",
+ map(operator.itemgetter(1), token_generator),
+ )
+ )
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval
@@ -63,7 +82,7 @@ def _rewrite_assign(tok):
Either the input or token or the replacement values
"""
toknum, tokval = tok
- return toknum, '==' if tokval == '=' else tokval
+ return toknum, "==" if tokval == "=" else tokval
def _replace_booleans(tok):
@@ -82,10 +101,10 @@ def _replace_booleans(tok):
"""
toknum, tokval = tok
if toknum == tokenize.OP:
- if tokval == '&':
- return tokenize.NAME, 'and'
- elif tokval == '|':
- return tokenize.NAME, 'or'
+ if tokval == "&":
+ return tokenize.NAME, "and"
+ elif tokval == "|":
+ return tokenize.NAME, "or"
return toknum, tokval
return toknum, tokval
@@ -110,7 +129,7 @@ def _replace_locals(tok):
is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
"""
toknum, tokval = tok
- if toknum == tokenize.OP and tokval == '@':
+ if toknum == tokenize.OP and tokval == "@":
return tokenize.OP, _LOCAL_TAG
return toknum, tokval
@@ -147,13 +166,19 @@ def _compose2(f, g):
def _compose(*funcs):
"""Compose 2 or more callables"""
- assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
+ assert len(funcs) > 1, "At least 2 callables must be passed to compose"
return reduce(_compose2, funcs)
-def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
- _rewrite_assign,
- _clean_spaces_backtick_quoted_names)):
+def _preparse(
+ source,
+ f=_compose(
+ _replace_locals,
+ _replace_booleans,
+ _rewrite_assign,
+ _clean_spaces_backtick_quoted_names,
+ ),
+):
"""Compose a collection of tokenization functions
Parameters
@@ -177,7 +202,7 @@ def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
the ``tokenize`` module and ``tokval`` is a string.
"""
- assert callable(f), 'f must be callable'
+ assert callable(f), "f must be callable"
return tokenize.untokenize((f(x) for x in tokenize_string(source)))
@@ -191,15 +216,17 @@ def _is_type(t):
# partition all AST nodes
-_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and
- issubclass(x, ast.AST),
- (getattr(ast, node) for node in dir(ast))))
+_all_nodes = frozenset(
+ filter(
+ lambda x: isinstance(x, type) and issubclass(x, ast.AST),
+ (getattr(ast, node) for node in dir(ast)),
+ )
+)
def _filter_nodes(superclass, all_nodes=_all_nodes):
"""Filter out AST nodes that are subclasses of ``superclass``."""
- node_names = (node.__name__ for node in all_nodes
- if issubclass(node, superclass))
+ node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
return frozenset(node_names)
@@ -221,25 +248,44 @@ def _filter_nodes(superclass, all_nodes=_all_nodes):
# nodes that we don't support directly but are needed for parsing
-_hacked_nodes = frozenset(['Assign', 'Module', 'Expr'])
-
-
-_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp',
- 'DictComp', 'SetComp', 'Repr', 'Lambda',
- 'Set', 'AST', 'Is', 'IsNot'])
+_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
+
+
+_unsupported_expr_nodes = frozenset(
+ [
+ "Yield",
+ "GeneratorExp",
+ "IfExp",
+ "DictComp",
+ "SetComp",
+ "Repr",
+ "Lambda",
+ "Set",
+ "AST",
+ "Is",
+ "IsNot",
+ ]
+)
# these nodes are low priority or won't ever be supported (e.g., AST)
-_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes |
- _arguments_nodes | _keyword_nodes | _alias_nodes |
- _expr_context_nodes | _unsupported_expr_nodes) -
- _hacked_nodes)
+_unsupported_nodes = (
+ _stmt_nodes
+ | _mod_nodes
+ | _handler_nodes
+ | _arguments_nodes
+ | _keyword_nodes
+ | _alias_nodes
+ | _expr_context_nodes
+ | _unsupported_expr_nodes
+) - _hacked_nodes
# we're adding a different assignment in some cases to be equality comparison
# and we don't want `stmt` and friends in their so get only the class whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
-_msg = 'cannot both support and not support {intersection}'.format(
- intersection=_unsupported_nodes & _base_supported_nodes)
+_msg = "cannot both support and not support {intersection}".format(
+ intersection=_unsupported_nodes & _base_supported_nodes
+)
assert not _unsupported_nodes & _base_supported_nodes, _msg
@@ -249,8 +295,10 @@ def _node_not_implemented(node_name, cls):
"""
def f(self, *args, **kwargs):
- raise NotImplementedError("{name!r} nodes are not "
- "implemented".format(name=node_name))
+ raise NotImplementedError(
+ "{name!r} nodes are not " "implemented".format(name=node_name)
+ )
+
return f
@@ -262,14 +310,16 @@ def disallow(nodes):
-------
disallowed : callable
"""
+
def disallowed(cls):
cls.unsupported_nodes = ()
for node in nodes:
new_method = _node_not_implemented(node, cls)
- name = 'visit_{node}'.format(node=node)
+ name = "visit_{node}".format(node=node)
cls.unsupported_nodes += (name,)
setattr(cls, name, new_method)
return cls
+
return disallowed
@@ -290,25 +340,27 @@ def f(self, node, *args, **kwargs):
f : callable
"""
return partial(op_class, op_symbol, *args, **kwargs)
+
return f
-_op_classes = {'binary': BinOp, 'unary': UnaryOp}
+_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
"""Decorator to add default implementation of ops."""
+
def f(cls):
for op_attr_name, op_class in op_classes.items():
- ops = getattr(cls, '{name}_ops'.format(name=op_attr_name))
- ops_map = getattr(cls, '{name}_op_nodes_map'.format(
- name=op_attr_name))
+ ops = getattr(cls, "{name}_ops".format(name=op_attr_name))
+ ops_map = getattr(cls, "{name}_op_nodes_map".format(name=op_attr_name))
for op in ops:
op_node = ops_map[op]
if op_node is not None:
made_op = _op_maker(op_class, op)
- setattr(cls, 'visit_{node}'.format(node=op_node), made_op)
+ setattr(cls, "visit_{node}".format(node=op_node), made_op)
return cls
+
return f
@@ -326,24 +378,43 @@ class BaseExprVisitor(ast.NodeVisitor):
parser : str
preparser : callable
"""
+
const_type = Constant # type: Type[Term]
term_type = Term
binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
- binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn',
- 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult',
- None, 'Pow', 'FloorDiv', 'Mod')
+ binary_op_nodes = (
+ "Gt",
+ "Lt",
+ "GtE",
+ "LtE",
+ "Eq",
+ "NotEq",
+ "In",
+ "NotIn",
+ "BitAnd",
+ "BitOr",
+ "And",
+ "Or",
+ "Add",
+ "Sub",
+ "Mult",
+ None,
+ "Pow",
+ "FloorDiv",
+ "Mod",
+ )
binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
unary_ops = _unary_ops_syms
- unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not'
+ unary_op_nodes = "UAdd", "USub", "Invert", "Not"
unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
rewrite_map = {
ast.Eq: ast.In,
ast.NotEq: ast.NotIn,
ast.In: ast.In,
- ast.NotIn: ast.NotIn
+ ast.NotIn: ast.NotIn,
}
def __init__(self, env, engine, parser, preparser=_preparse):
@@ -360,18 +431,18 @@ def visit(self, node, **kwargs):
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
from keyword import iskeyword
+
if any(iskeyword(x) for x in clean.split()):
- e.msg = ("Python keyword not valid identifier"
- " in numexpr query")
+ e.msg = "Python keyword not valid identifier" " in numexpr query"
raise e
- method = 'visit_' + node.__class__.__name__
+ method = "visit_" + node.__class__.__name__
visitor = getattr(self, method)
return visitor(node, **kwargs)
def visit_Module(self, node, **kwargs):
if len(node.body) != 1:
- raise SyntaxError('only a single expression is allowed')
+ raise SyntaxError("only a single expression is allowed")
expr = node.body[0]
return self.visit(expr, **kwargs)
@@ -408,22 +479,29 @@ def _rewrite_membership_op(self, node, left, right):
def _maybe_transform_eq_ne(self, node, left=None, right=None):
if left is None:
- left = self.visit(node.left, side='left')
+ left = self.visit(node.left, side="left")
if right is None:
- right = self.visit(node.right, side='right')
- op, op_class, left, right = self._rewrite_membership_op(node, left,
- right)
+ right = self.visit(node.right, side="right")
+ op, op_class, left, right = self._rewrite_membership_op(node, left, right)
return op, op_class, left, right
def _maybe_downcast_constants(self, left, right):
f32 = np.dtype(np.float32)
- if (left.is_scalar and hasattr(left, 'value') and
- not right.is_scalar and right.return_type == f32):
+ if (
+ left.is_scalar
+ and hasattr(left, "value")
+ and not right.is_scalar
+ and right.return_type == f32
+ ):
# right is a float32 array, left is a scalar
name = self.env.add_tmp(np.float32(left.value))
left = self.term_type(name, self.env)
- if (right.is_scalar and hasattr(right, 'value') and
- not left.is_scalar and left.return_type == f32):
+ if (
+ right.is_scalar
+ and hasattr(right, "value")
+ and not left.is_scalar
+ and left.return_type == f32
+ ):
# left is a float32 array, right is a scalar
name = self.env.add_tmp(np.float32(right.value))
right = self.term_type(name, self.env)
@@ -437,25 +515,33 @@ def _maybe_eval(self, binop, eval_in_python):
# [1,2] in a + 2 * b
# in that case a + 2 * b will be evaluated using numexpr, and the "in"
# call will be evaluated using isin (in python space)
- return binop.evaluate(self.env, self.engine, self.parser,
- self.term_type, eval_in_python)
-
- def _maybe_evaluate_binop(self, op, op_class, lhs, rhs,
- eval_in_python=('in', 'not in'),
- maybe_eval_in_python=('==', '!=', '<', '>',
- '<=', '>=')):
+ return binop.evaluate(
+ self.env, self.engine, self.parser, self.term_type, eval_in_python
+ )
+
+ def _maybe_evaluate_binop(
+ self,
+ op,
+ op_class,
+ lhs,
+ rhs,
+ eval_in_python=("in", "not in"),
+ maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
+ ):
res = op(lhs, rhs)
if res.has_invalid_return_type:
- raise TypeError("unsupported operand type(s) for {op}:"
- " '{lhs}' and '{rhs}'".format(op=res.op,
- lhs=lhs.type,
- rhs=rhs.type))
-
- if self.engine != 'pytables':
- if (res.op in _cmp_ops_syms and
- getattr(lhs, 'is_datetime', False) or
- getattr(rhs, 'is_datetime', False)):
+ raise TypeError(
+ "unsupported operand type(s) for {op}:"
+ " '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type)
+ )
+
+ if self.engine != "pytables":
+ if (
+ res.op in _cmp_ops_syms
+ and getattr(lhs, "is_datetime", False)
+ or getattr(rhs, "is_datetime", False)
+ ):
# all date ops must be done in python bc numexpr doesn't work
# well with NaT
return self._maybe_eval(res, self.binary_ops)
@@ -463,13 +549,14 @@ def _maybe_evaluate_binop(self, op, op_class, lhs, rhs,
if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._maybe_eval(res, eval_in_python)
- elif self.engine != 'pytables':
- if (getattr(lhs, 'return_type', None) == object or
- getattr(rhs, 'return_type', None) == object):
+ elif self.engine != "pytables":
+ if (
+ getattr(lhs, "return_type", None) == object
+ or getattr(rhs, "return_type", None) == object
+ ):
# evaluate "==" and "!=" in python if either of our operands
# has an object return type
- return self._maybe_eval(res, eval_in_python +
- maybe_eval_in_python)
+ return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
def visit_BinOp(self, node, **kwargs):
@@ -478,7 +565,7 @@ def visit_BinOp(self, node, **kwargs):
return self._maybe_evaluate_binop(op, op_class, left, right)
def visit_Div(self, node, **kwargs):
- truediv = self.env.scope['truediv']
+ truediv = self.env.scope["truediv"]
return lambda lhs, rhs: Div(lhs, rhs, truediv)
def visit_UnaryOp(self, node, **kwargs):
@@ -512,15 +599,17 @@ def visit_Index(self, node, **kwargs):
def visit_Subscript(self, node, **kwargs):
value = self.visit(node.value)
slobj = self.visit(node.slice)
- result = pd.eval(slobj, local_dict=self.env, engine=self.engine,
- parser=self.parser)
+ result = pd.eval(
+ slobj, local_dict=self.env, engine=self.engine, parser=self.parser
+ )
try:
# a Term instance
v = value.value[result]
except AttributeError:
# an Op instance
- lhs = pd.eval(value, local_dict=self.env, engine=self.engine,
- parser=self.parser)
+ lhs = pd.eval(
+ value, local_dict=self.env, engine=self.engine, parser=self.parser
+ )
v = lhs[result]
name = self.env.add_tmp(v)
return self.term_type(name, env=self.env)
@@ -551,22 +640,24 @@ def visit_Assign(self, node, **kwargs):
"""
if len(node.targets) != 1:
- raise SyntaxError('can only assign a single expression')
+ raise SyntaxError("can only assign a single expression")
if not isinstance(node.targets[0], ast.Name):
- raise SyntaxError('left hand side of an assignment must be a '
- 'single name')
+ raise SyntaxError(
+ "left hand side of an assignment must be a " "single name"
+ )
if self.env.target is None:
- raise ValueError('cannot assign without a target object')
+ raise ValueError("cannot assign without a target object")
try:
assigner = self.visit(node.targets[0], **kwargs)
except UndefinedVariableError:
assigner = node.targets[0].id
- self.assigner = getattr(assigner, 'name', assigner)
+ self.assigner = getattr(assigner, "name", assigner)
if self.assigner is None:
- raise SyntaxError('left hand side of an assignment must be a '
- 'single resolvable name')
+ raise SyntaxError(
+ "left hand side of an assignment must be a " "single resolvable name"
+ )
return self.visit(node.value, **kwargs)
@@ -587,8 +678,7 @@ def visit_Attribute(self, node, **kwargs):
if isinstance(value, ast.Name) and value.id == attr:
return resolved
- raise ValueError("Invalid Attribute context {name}"
- .format(name=ctx.__name__))
+ raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__))
def visit_Call(self, node, side=None, **kwargs):
@@ -608,9 +698,8 @@ def visit_Call(self, node, side=None, **kwargs):
raise
if res is None:
- raise ValueError("Invalid function call {func}"
- .format(func=node.func.id))
- if hasattr(res, 'value'):
+ raise ValueError("Invalid function call {func}".format(func=node.func.id))
+ if hasattr(res, "value"):
res = res.value
if isinstance(res, FuncNode):
@@ -618,8 +707,10 @@ def visit_Call(self, node, side=None, **kwargs):
new_args = [self.visit(arg) for arg in node.args]
if node.keywords:
- raise TypeError("Function \"{name}\" does not support keyword "
- "arguments".format(name=res.name))
+ raise TypeError(
+ 'Function "{name}" does not support keyword '
+ "arguments".format(name=res.name)
+ )
return res(*new_args, **kwargs)
@@ -629,8 +720,10 @@ def visit_Call(self, node, side=None, **kwargs):
for key in node.keywords:
if not isinstance(key, ast.keyword):
- raise ValueError("keyword error in function call "
- "'{func}'".format(func=node.func.id))
+ raise ValueError(
+ "keyword error in function call "
+ "'{func}'".format(func=node.func.id)
+ )
if key.arg:
kwargs[key.arg] = self.visit(key.value).value
@@ -654,8 +747,9 @@ def visit_Compare(self, node, **kwargs):
left = node.left
values = []
for op, comp in zip(ops, comps):
- new_node = self.visit(ast.Compare(comparators=[comp], left=left,
- ops=[self.translate_In(op)]))
+ new_node = self.visit(
+ ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
+ )
left = comp
values.append(new_node)
return self.visit(ast.BoolOp(op=ast.And(), values=values))
@@ -670,33 +764,39 @@ def visitor(x, y):
lhs = self._try_visit_binop(x)
rhs = self._try_visit_binop(y)
- op, op_class, lhs, rhs = self._maybe_transform_eq_ne(
- node, lhs, rhs)
+ op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
operands = node.values
return reduce(visitor, operands)
-_python_not_supported = frozenset(['Dict', 'BoolOp', 'In', 'NotIn'])
+_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(_reductions + _mathops)
-@disallow((_unsupported_nodes | _python_not_supported) -
- (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn',
- 'Tuple'])))
+@disallow(
+ (_unsupported_nodes | _python_not_supported)
+ - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
+)
class PandasExprVisitor(BaseExprVisitor):
-
- def __init__(self, env, engine, parser,
- preparser=partial(_preparse, f=_compose(
- _replace_locals, _replace_booleans,
- _clean_spaces_backtick_quoted_names))):
+ def __init__(
+ self,
+ env,
+ engine,
+ parser,
+ preparser=partial(
+ _preparse,
+ f=_compose(
+ _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
+ ),
+ ),
+ ):
super().__init__(env, engine, parser, preparser)
-@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not']))
+@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
-
def __init__(self, env, engine, parser, preparser=lambda x: x):
super().__init__(env, engine, parser, preparser=preparser)
@@ -715,19 +815,20 @@ class Expr(StringMixin):
level : int, optional, default 2
"""
- def __init__(self, expr, engine='numexpr', parser='pandas', env=None,
- truediv=True, level=0):
+ def __init__(
+ self, expr, engine="numexpr", parser="pandas", env=None, truediv=True, level=0
+ ):
self.expr = expr
self.env = env or Scope(level=level + 1)
self.engine = engine
self.parser = parser
- self.env.scope['truediv'] = truediv
+ self.env.scope["truediv"] = truediv
self._visitor = _parsers[parser](self.env, self.engine, self.parser)
self.terms = self.parse()
@property
def assigner(self):
- return getattr(self._visitor, 'assigner', None)
+ return getattr(self._visitor, "assigner", None)
def __call__(self):
return self.terms(self.env)
@@ -750,4 +851,4 @@ def names(self):
return frozenset(term.name for term in com.flatten(self.terms))
-_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor}
+_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}
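The preparsing pipeline above (`&`/`|` rewritten to `and`/`or`, `@` locals, backtick-quoted names) is what DataFrame.query runs through; an illustrative query against a hypothetical frame:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "col 1": [10, 20, 30]})
threshold = 1

# '&' is rewritten to 'and', '@threshold' resolves the local variable,
# and the backticks allow a column name containing a space
df.query("x > @threshold & `col 1` < 30")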
diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py
index b01000a7aee5b..dc4e6e85f6e7d 100644
--- a/pandas/core/computation/expressions.py
+++ b/pandas/core/computation/expressions.py
@@ -28,8 +28,8 @@
# the set of dtypes that we will allow pass to numexpr
_ALLOWED_DTYPES = {
- 'evaluate': {'int64', 'int32', 'float64', 'float32', 'bool'},
- 'where': {'int64', 'float64', 'bool'}
+ "evaluate": {"int64", "int32", "float64", "float32", "bool"},
+ "where": {"int64", "float64", "bool"},
}
# the minimum prod shape that we will use numexpr
@@ -65,7 +65,7 @@ def _evaluate_standard(op, op_str, a, b, **eval_kwargs):
""" standard evaluation """
if _TEST_MODE:
_store_test_result(False)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
return op(a, b)
@@ -79,7 +79,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):
# check for dtype compatibility
dtypes = set()
for o in [a, b]:
- if hasattr(o, 'dtypes'):
+ if hasattr(o, "dtypes"):
s = o.dtypes.value_counts()
if len(s) > 1:
return False
@@ -94,11 +94,10 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):
return False
-def _evaluate_numexpr(op, op_str, a, b, truediv=True,
- reversed=False, **eval_kwargs):
+def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwargs):
result = None
- if _can_use_numexpr(op, op_str, a, b, 'evaluate'):
+ if _can_use_numexpr(op, op_str, a, b, "evaluate"):
try:
# we were originally called by a reversed op
@@ -108,13 +107,15 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True,
a_value = getattr(a, "values", a)
b_value = getattr(b, "values", b)
- result = ne.evaluate('a_value {op} b_value'.format(op=op_str),
- local_dict={'a_value': a_value,
- 'b_value': b_value},
- casting='safe', truediv=truediv,
- **eval_kwargs)
+ result = ne.evaluate(
+ "a_value {op} b_value".format(op=op_str),
+ local_dict={"a_value": a_value, "b_value": b_value},
+ casting="safe",
+ truediv=truediv,
+ **eval_kwargs
+ )
except ValueError as detail:
- if 'unknown type object' in str(detail):
+ if "unknown type object" in str(detail):
pass
if _TEST_MODE:
@@ -127,26 +128,33 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True,
def _where_standard(cond, a, b):
- return np.where(com.values_from_object(cond), com.values_from_object(a),
- com.values_from_object(b))
+ return np.where(
+ com.values_from_object(cond),
+ com.values_from_object(a),
+ com.values_from_object(b),
+ )
def _where_numexpr(cond, a, b):
result = None
- if _can_use_numexpr(None, 'where', a, b, 'where'):
+ if _can_use_numexpr(None, "where", a, b, "where"):
try:
- cond_value = getattr(cond, 'values', cond)
- a_value = getattr(a, 'values', a)
- b_value = getattr(b, 'values', b)
- result = ne.evaluate('where(cond_value, a_value, b_value)',
- local_dict={'cond_value': cond_value,
- 'a_value': a_value,
- 'b_value': b_value},
- casting='safe')
+ cond_value = getattr(cond, "values", cond)
+ a_value = getattr(a, "values", a)
+ b_value = getattr(b, "values", b)
+ result = ne.evaluate(
+ "where(cond_value, a_value, b_value)",
+ local_dict={
+ "cond_value": cond_value,
+ "a_value": a_value,
+ "b_value": b_value,
+ },
+ casting="safe",
+ )
except ValueError as detail:
- if 'unknown type object' in str(detail):
+ if "unknown type object" in str(detail):
pass
except Exception as detail:
raise TypeError(str(detail))
@@ -158,40 +166,44 @@ def _where_numexpr(cond, a, b):
# turn myself on
-set_use_numexpr(get_option('compute.use_numexpr'))
+set_use_numexpr(get_option("compute.use_numexpr"))
def _has_bool_dtype(x):
try:
if isinstance(x, ABCDataFrame):
- return 'bool' in x.dtypes
+ return "bool" in x.dtypes
else:
return x.dtype == bool
except AttributeError:
return isinstance(x, (bool, np.bool_))
-def _bool_arith_check(op_str, a, b, not_allowed=frozenset(('/', '//', '**')),
- unsupported=None):
+def _bool_arith_check(
+ op_str, a, b, not_allowed=frozenset(("/", "//", "**")), unsupported=None
+):
if unsupported is None:
- unsupported = {'+': '|', '*': '&', '-': '^'}
+ unsupported = {"+": "|", "*": "&", "-": "^"}
if _has_bool_dtype(a) and _has_bool_dtype(b):
if op_str in unsupported:
- warnings.warn("evaluating in Python space because the {op!r} "
- "operator is not supported by numexpr for "
- "the bool dtype, use {alt_op!r} instead"
- .format(op=op_str, alt_op=unsupported[op_str]))
+ warnings.warn(
+ "evaluating in Python space because the {op!r} "
+ "operator is not supported by numexpr for "
+ "the bool dtype, use {alt_op!r} instead".format(
+ op=op_str, alt_op=unsupported[op_str]
+ )
+ )
return False
if op_str in not_allowed:
- raise NotImplementedError("operator {op!r} not implemented for "
- "bool dtypes".format(op=op_str))
+ raise NotImplementedError(
+ "operator {op!r} not implemented for " "bool dtypes".format(op=op_str)
+ )
return True
-def evaluate(op, op_str, a, b, use_numexpr=True,
- **eval_kwargs):
+def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs):
""" evaluate and return the expression of the op on a and b
Parameters
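The module above decides per-op whether to hand work to numexpr; the behaviour can be toggled through the regular option, and boolean data should use the bitwise operators as the warning reformatted above says:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100000, 4), columns=list("abcd"))

pd.set_option("compute.use_numexpr", True)   # default when numexpr is installed
total = df["a"] + df["b"]                    # above the minimum size, eligible for numexpr

# for bool data '+', '*', '-' are redirected by _bool_arith_check;
# write '|', '&', '^' instead
mask = (df["a"] > 0) | (df["b"] > 0)

pd.set_option("compute.use_numexpr", False)  # force the plain numpy path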
diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py
index fd96739f4da76..9e6928372808e 100644
--- a/pandas/core/computation/ops.py
+++ b/pandas/core/computation/ops.py
@@ -19,19 +19,36 @@
from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded
-_reductions = 'sum', 'prod'
-
-_unary_math_ops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p',
- 'sqrt', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos',
- 'arctan', 'arccosh', 'arcsinh', 'arctanh', 'abs', 'log10',
- 'floor', 'ceil'
- )
-_binary_math_ops = ('arctan2',)
+_reductions = "sum", "prod"
+
+_unary_math_ops = (
+ "sin",
+ "cos",
+ "exp",
+ "log",
+ "expm1",
+ "log1p",
+ "sqrt",
+ "sinh",
+ "cosh",
+ "tanh",
+ "arcsin",
+ "arccos",
+ "arctan",
+ "arccosh",
+ "arcsinh",
+ "arctanh",
+ "abs",
+ "log10",
+ "floor",
+ "ceil",
+)
+_binary_math_ops = ("arctan2",)
_mathops = _unary_math_ops + _binary_math_ops
-_LOCAL_TAG = '__pd_eval_local_'
+_LOCAL_TAG = "__pd_eval_local_"
class UndefinedVariableError(NameError):
@@ -40,14 +57,13 @@ class UndefinedVariableError(NameError):
def __init__(self, name, is_local):
if is_local:
- msg = 'local variable {0!r} is not defined'
+ msg = "local variable {0!r} is not defined"
else:
- msg = 'name {0!r} is not defined'
+ msg = "name {0!r} is not defined"
super().__init__(msg.format(name))
class Term(StringMixin):
-
def __new__(cls, name, env, side=None, encoding=None):
klass = Constant if not isinstance(name, str) else cls
supr_new = super(Term, klass).__new__
@@ -58,14 +74,13 @@ def __init__(self, name, env, side=None, encoding=None):
self.env = env
self.side = side
tname = str(name)
- self.is_local = (tname.startswith(_LOCAL_TAG) or
- tname in _DEFAULT_GLOBALS)
+ self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS
self._value = self._resolve_name()
self.encoding = encoding
@property
def local_name(self):
- return self.name.replace(_LOCAL_TAG, '')
+ return self.name.replace(_LOCAL_TAG, "")
def __str__(self):
return pprint_thing(self.name)
@@ -80,9 +95,10 @@ def _resolve_name(self):
res = self.env.resolve(self.local_name, is_local=self.is_local)
self.update(res)
- if hasattr(res, 'ndim') and res.ndim > 2:
- raise NotImplementedError("N-dimensional objects, where N > 2,"
- " are not supported with eval")
+ if hasattr(res, "ndim") and res.ndim > 2:
+ raise NotImplementedError(
+ "N-dimensional objects, where N > 2," " are not supported with eval"
+ )
return res
def update(self, value):
@@ -124,9 +140,10 @@ def type(self):
@property
def raw(self):
- return pprint_thing('{0}(name={1!r}, type={2})'
- ''.format(self.__class__.__name__, self.name,
- self.type))
+ return pprint_thing(
+ "{0}(name={1!r}, type={2})"
+ "".format(self.__class__.__name__, self.name, self.type)
+ )
@property
def is_datetime(self):
@@ -155,7 +172,6 @@ def ndim(self):
class Constant(Term):
-
def __init__(self, value, env, side=None, encoding=None):
super().__init__(value, env, side=side, encoding=encoding)
@@ -172,7 +188,7 @@ def __str__(self):
return repr(self.name)
-_bool_op_map = {'not': '~', 'and': '&', 'or': '|'}
+_bool_op_map = {"not": "~", "and": "&", "or": "|"}
class Op(StringMixin):
@@ -183,7 +199,7 @@ class Op(StringMixin):
def __init__(self, op, operands, *args, **kwargs):
self.op = _bool_op_map.get(op, op)
self.operands = operands
- self.encoding = kwargs.get('encoding', None)
+ self.encoding = kwargs.get("encoding", None)
def __iter__(self):
return iter(self.operands)
@@ -192,9 +208,8 @@ def __str__(self):
"""Print a generic n-ary operator and its operands using infix
notation"""
# recurse over the operands
- parened = ('({0})'.format(pprint_thing(opr))
- for opr in self.operands)
- return pprint_thing(' {0} '.format(self.op).join(parened))
+ parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands)
+ return pprint_thing(" {0} ".format(self.op).join(parened))
@property
def return_type(self):
@@ -206,7 +221,7 @@ def return_type(self):
@property
def has_invalid_return_type(self):
types = self.operand_types
- obj_dtype_set = frozenset([np.dtype('object')])
+ obj_dtype_set = frozenset([np.dtype("object")])
return self.return_type == object and types - obj_dtype_set
@property
@@ -257,23 +272,23 @@ def _not_in(x, y):
return x not in y
-_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in'
+_cmp_ops_syms = ">", "<", ">=", "<=", "==", "!=", "in", "not in"
_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in
_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs))
-_bool_ops_syms = '&', '|', 'and', 'or'
+_bool_ops_syms = "&", "|", "and", "or"
_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_
_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs))
-_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%'
-_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv,
- op.mod)
+_arith_ops_syms = "+", "-", "*", "/", "**", "//", "%"
+_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv, op.pow, op.floordiv, op.mod)
_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs))
-_special_case_arith_ops_syms = '**', '//', '%'
+_special_case_arith_ops_syms = "**", "//", "%"
_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod
-_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms,
- _special_case_arith_ops_funcs))
+_special_case_arith_ops_dict = dict(
+ zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs)
+)
_binary_ops_dict = {}
@@ -337,8 +352,10 @@ def __init__(self, op, lhs, rhs, **kwargs):
except KeyError:
# has to be made a list for python3
keys = list(_binary_ops_dict.keys())
- raise ValueError('Invalid binary operator {0!r}, valid'
- ' operators are {1}'.format(op, keys))
+ raise ValueError(
+ "Invalid binary operator {0!r}, valid"
+ " operators are {1}".format(op, keys)
+ )
def __call__(self, env):
"""Recursively evaluate an expression in Python space.
@@ -353,7 +370,7 @@ def __call__(self, env):
The result of an evaluated expression.
"""
# handle truediv
- if self.op == '/' and env.scope['truediv']:
+ if self.op == "/" and env.scope["truediv"]:
self.func = op.truediv
# recurse over the left/right nodes
@@ -378,24 +395,32 @@ def evaluate(self, env, engine, parser, term_type, eval_in_python):
term_type
The "pre-evaluated" expression as an instance of ``term_type``
"""
- if engine == 'python':
+ if engine == "python":
res = self(env)
else:
# recurse over the left/right nodes
- left = self.lhs.evaluate(env, engine=engine, parser=parser,
- term_type=term_type,
- eval_in_python=eval_in_python)
- right = self.rhs.evaluate(env, engine=engine, parser=parser,
- term_type=term_type,
- eval_in_python=eval_in_python)
+ left = self.lhs.evaluate(
+ env,
+ engine=engine,
+ parser=parser,
+ term_type=term_type,
+ eval_in_python=eval_in_python,
+ )
+ right = self.rhs.evaluate(
+ env,
+ engine=engine,
+ parser=parser,
+ term_type=term_type,
+ eval_in_python=eval_in_python,
+ )
# base cases
if self.op in eval_in_python:
res = self.func(left.value, right.value)
else:
from pandas.core.computation.eval import eval
- res = eval(self, local_dict=env, engine=engine,
- parser=parser)
+
+ res = eval(self, local_dict=env, engine=engine, parser=parser)
name = env.add_tmp(res)
return term_type(name, env=env)
@@ -403,10 +428,10 @@ def evaluate(self, env, engine, parser, term_type, eval_in_python):
def convert_values(self):
"""Convert datetimes to a comparable value in an expression.
"""
+
def stringify(value):
if self.encoding is not None:
- encoder = partial(pprint_thing_encoded,
- encoding=self.encoding)
+ encoder = partial(pprint_thing_encoded, encoding=self.encoding)
else:
encoder = pprint_thing
return encoder(value)
@@ -419,7 +444,7 @@ def stringify(value):
v = stringify(v)
v = Timestamp(_ensure_decoded(v))
if v.tz is not None:
- v = v.tz_convert('UTC')
+ v = v.tz_convert("UTC")
self.rhs.update(v)
if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
@@ -428,14 +453,20 @@ def stringify(value):
v = stringify(v)
v = Timestamp(_ensure_decoded(v))
if v.tz is not None:
- v = v.tz_convert('UTC')
+ v = v.tz_convert("UTC")
self.lhs.update(v)
def _disallow_scalar_only_bool_ops(self):
- if ((self.lhs.is_scalar or self.rhs.is_scalar) and
- self.op in _bool_ops_dict and
- (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and
- issubclass(self.lhs.return_type, (bool, np.bool_))))):
+ if (
+ (self.lhs.is_scalar or self.rhs.is_scalar)
+ and self.op in _bool_ops_dict
+ and (
+ not (
+ issubclass(self.rhs.return_type, (bool, np.bool_))
+ and issubclass(self.lhs.return_type, (bool, np.bool_))
+ )
+ )
+ ):
raise NotImplementedError("cannot evaluate scalar only bool ops")
@@ -457,20 +488,20 @@ class Div(BinOp):
"""
def __init__(self, lhs, rhs, truediv, *args, **kwargs):
- super().__init__('/', lhs, rhs, *args, **kwargs)
+ super().__init__("/", lhs, rhs, *args, **kwargs)
if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
- raise TypeError("unsupported operand type(s) for {0}:"
- " '{1}' and '{2}'".format(self.op,
- lhs.return_type,
- rhs.return_type))
+ raise TypeError(
+ "unsupported operand type(s) for {0}:"
+ " '{1}' and '{2}'".format(self.op, lhs.return_type, rhs.return_type)
+ )
# do not upcast float32s to float64 un-necessarily
acceptable_dtypes = [np.float32, np.float_]
_cast_inplace(com.flatten(self), acceptable_dtypes, np.float_)
-_unary_ops_syms = '+', '-', '~', 'not'
+_unary_ops_syms = "+", "-", "~", "not"
_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert
_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs))
@@ -499,54 +530,55 @@ def __init__(self, op, operand):
try:
self.func = _unary_ops_dict[op]
except KeyError:
- raise ValueError('Invalid unary operator {0!r}, valid operators '
- 'are {1}'.format(op, _unary_ops_syms))
+ raise ValueError(
+ "Invalid unary operator {0!r}, valid operators "
+ "are {1}".format(op, _unary_ops_syms)
+ )
def __call__(self, env):
operand = self.operand(env)
return self.func(operand)
def __str__(self):
- return pprint_thing('{0}({1})'.format(self.op, self.operand))
+ return pprint_thing("{0}({1})".format(self.op, self.operand))
@property
def return_type(self):
operand = self.operand
- if operand.return_type == np.dtype('bool'):
- return np.dtype('bool')
- if (isinstance(operand, Op) and
- (operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict)):
- return np.dtype('bool')
- return np.dtype('int')
+ if operand.return_type == np.dtype("bool"):
+ return np.dtype("bool")
+ if isinstance(operand, Op) and (
+ operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict
+ ):
+ return np.dtype("bool")
+ return np.dtype("int")
class MathCall(Op):
-
def __init__(self, func, args):
super().__init__(func.name, args)
self.func = func
def __call__(self, env):
operands = [op(env) for op in self.operands]
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
return self.func.func(*operands)
def __str__(self):
operands = map(str, self.operands)
- return pprint_thing('{0}({1})'.format(self.op, ','.join(operands)))
+ return pprint_thing("{0}({1})".format(self.op, ",".join(operands)))
class FuncNode:
def __init__(self, name):
- from pandas.core.computation.check import (_NUMEXPR_INSTALLED,
- _NUMEXPR_VERSION)
+ from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION
+
if name not in _mathops or (
- _NUMEXPR_INSTALLED and
- _NUMEXPR_VERSION < LooseVersion('2.6.9') and
- name in ('floor', 'ceil')
+ _NUMEXPR_INSTALLED
+ and _NUMEXPR_VERSION < LooseVersion("2.6.9")
+ and name in ("floor", "ceil")
):
- raise ValueError(
- "\"{0}\" is not a supported function".format(name))
+ raise ValueError('"{0}" is not a supported function'.format(name))
self.name = name
self.func = getattr(np, name)
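`_mathops` above is the whitelist that becomes FuncNode instances; the same names are callable from eval/query expressions, e.g. (frame is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.linspace(0.1, 1.0, 5)})

# sin, log, sqrt, ... dispatch to the matching numpy functions;
# floor/ceil additionally require numexpr >= 2.6.9, as checked in FuncNode
df.eval("y = log(x) + sqrt(x)", inplace=True)
pd.eval("sin(df.x)")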
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index 25cfa8fe17697..e4e005c024345 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -22,18 +22,14 @@
class Scope(expr.Scope):
- __slots__ = 'queryables',
+ __slots__ = ("queryables",)
- def __init__(self, level, global_dict=None, local_dict=None,
- queryables=None):
- super().__init__(level + 1,
- global_dict=global_dict,
- local_dict=local_dict)
+ def __init__(self, level, global_dict=None, local_dict=None, queryables=None):
+ super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
self.queryables = queryables or dict()
class Term(ops.Term):
-
def __new__(cls, name, env, side=None, encoding=None):
klass = Constant if not isinstance(name, str) else cls
supr_new = StringMixin.__new__
@@ -44,10 +40,9 @@ def __init__(self, name, env, side=None, encoding=None):
def _resolve_name(self):
# must be a queryables
- if self.side == 'left':
+ if self.side == "left":
if self.name not in self.env.queryables:
- raise NameError('name {name!r} is not defined'
- .format(name=self.name))
+ raise NameError("name {name!r} is not defined".format(name=self.name))
return self.name
# resolve the rhs (and allow it to be None)
@@ -63,7 +58,6 @@ def value(self):
class Constant(Term):
-
def __init__(self, value, env, side=None, encoding=None):
super().__init__(value, env, side=side, encoding=encoding)
@@ -86,7 +80,6 @@ def _disallow_scalar_only_bool_ops(self):
pass
def prune(self, klass):
-
def pr(left, right):
""" create and return a new specialized BinOp from myself """
@@ -97,8 +90,9 @@ def pr(left, right):
k = klass
if isinstance(left, ConditionBinOp):
- if (isinstance(left, ConditionBinOp) and
- isinstance(right, ConditionBinOp)):
+ if isinstance(left, ConditionBinOp) and isinstance(
+ right, ConditionBinOp
+ ):
k = JointConditionBinOp
elif isinstance(left, k):
return left
@@ -106,16 +100,16 @@ def pr(left, right):
return right
elif isinstance(left, FilterBinOp):
- if (isinstance(left, FilterBinOp) and
- isinstance(right, FilterBinOp)):
+ if isinstance(left, FilterBinOp) and isinstance(right, FilterBinOp):
k = JointFilterBinOp
elif isinstance(left, k):
return left
elif isinstance(right, k):
return right
- return k(self.op, left, right, queryables=self.queryables,
- encoding=self.encoding).evaluate()
+ return k(
+ self.op, left, right, queryables=self.queryables, encoding=self.encoding
+ ).evaluate()
left, right = self.lhs, self.rhs
@@ -152,17 +146,17 @@ def is_in_table(self):
@property
def kind(self):
""" the kind of my field """
- return getattr(self.queryables.get(self.lhs), 'kind', None)
+ return getattr(self.queryables.get(self.lhs), "kind", None)
@property
def meta(self):
""" the meta of my field """
- return getattr(self.queryables.get(self.lhs), 'meta', None)
+ return getattr(self.queryables.get(self.lhs), "meta", None)
@property
def metadata(self):
""" the metadata of my field """
- return getattr(self.queryables.get(self.lhs), 'metadata', None)
+ return getattr(self.queryables.get(self.lhs), "metadata", None)
def generate(self, v):
""" create and return the op string for this TermValue """
@@ -175,64 +169,74 @@ def convert_value(self, v):
def stringify(value):
if self.encoding is not None:
- encoder = partial(pprint_thing_encoded,
- encoding=self.encoding)
+ encoder = partial(pprint_thing_encoded, encoding=self.encoding)
else:
encoder = pprint_thing
return encoder(value)
kind = _ensure_decoded(self.kind)
meta = _ensure_decoded(self.meta)
- if kind == 'datetime64' or kind == 'datetime':
+ if kind == "datetime64" or kind == "datetime":
if isinstance(v, (int, float)):
v = stringify(v)
v = _ensure_decoded(v)
v = Timestamp(v)
if v.tz is not None:
- v = v.tz_convert('UTC')
+ v = v.tz_convert("UTC")
return TermValue(v, v.value, kind)
- elif kind == 'timedelta64' or kind == 'timedelta':
- v = Timedelta(v, unit='s').value
+ elif kind == "timedelta64" or kind == "timedelta":
+ v = Timedelta(v, unit="s").value
return TermValue(int(v), v, kind)
- elif meta == 'category':
+ elif meta == "category":
metadata = com.values_from_object(self.metadata)
- result = metadata.searchsorted(v, side='left')
+ result = metadata.searchsorted(v, side="left")
# result returns 0 if v is first element or if v is not in metadata
# check that metadata contains v
if not result and v not in metadata:
result = -1
- return TermValue(result, result, 'integer')
- elif kind == 'integer':
+ return TermValue(result, result, "integer")
+ elif kind == "integer":
v = int(float(v))
return TermValue(v, v, kind)
- elif kind == 'float':
+ elif kind == "float":
v = float(v)
return TermValue(v, v, kind)
- elif kind == 'bool':
+ elif kind == "bool":
if isinstance(v, str):
- v = not v.strip().lower() in ['false', 'f', 'no',
- 'n', 'none', '0',
- '[]', '{}', '']
+ v = not v.strip().lower() in [
+ "false",
+ "f",
+ "no",
+ "n",
+ "none",
+ "0",
+ "[]",
+ "{}",
+ "",
+ ]
else:
v = bool(v)
return TermValue(v, v, kind)
elif isinstance(v, str):
# string quoting
- return TermValue(v, stringify(v), 'string')
+ return TermValue(v, stringify(v), "string")
else:
- raise TypeError("Cannot compare {v} of type {typ} to {kind} column"
- .format(v=v, typ=type(v), kind=kind))
+ raise TypeError(
+ "Cannot compare {v} of type {typ} to {kind} column".format(
+ v=v, typ=type(v), kind=kind
+ )
+ )
def convert_values(self):
pass
class FilterBinOp(BinOp):
-
def __str__(self):
- return pprint_thing("[Filter : [{lhs}] -> [{op}]"
- .format(lhs=self.filter[0], op=self.filter[1]))
+ return pprint_thing(
+ "[Filter : [{lhs}] -> [{op}]".format(lhs=self.filter[0], op=self.filter[1])
+ )
def invert(self):
""" invert the filter """
@@ -249,8 +253,7 @@ def format(self):
def evaluate(self):
if not self.is_valid:
- raise ValueError("query term is not valid [{slf}]"
- .format(slf=self))
+ raise ValueError("query term is not valid [{slf}]".format(slf=self))
rhs = self.conform(self.rhs)
values = [TermValue(v, v, self.kind).value for v in rhs]
@@ -258,41 +261,36 @@ def evaluate(self):
if self.is_in_table:
# if too many values to create the expression, use a filter instead
- if self.op in ['==', '!='] and len(values) > self._max_selectors:
+ if self.op in ["==", "!="] and len(values) > self._max_selectors:
filter_op = self.generate_filter_op()
- self.filter = (
- self.lhs,
- filter_op,
- pd.Index(values))
+ self.filter = (self.lhs, filter_op, pd.Index(values))
return self
return None
# equality conditions
- if self.op in ['==', '!=']:
+ if self.op in ["==", "!="]:
filter_op = self.generate_filter_op()
- self.filter = (
- self.lhs,
- filter_op,
- pd.Index(values))
+ self.filter = (self.lhs, filter_op, pd.Index(values))
else:
- raise TypeError("passing a filterable condition to a non-table "
- "indexer [{slf}]".format(slf=self))
+ raise TypeError(
+ "passing a filterable condition to a non-table "
+ "indexer [{slf}]".format(slf=self)
+ )
return self
def generate_filter_op(self, invert=False):
- if (self.op == '!=' and not invert) or (self.op == '==' and invert):
+ if (self.op == "!=" and not invert) or (self.op == "==" and invert):
return lambda axis, vals: ~axis.isin(vals)
else:
return lambda axis, vals: axis.isin(vals)
class JointFilterBinOp(FilterBinOp):
-
def format(self):
raise NotImplementedError("unable to collapse Joint Filters")
@@ -301,18 +299,17 @@ def evaluate(self):
class ConditionBinOp(BinOp):
-
def __str__(self):
- return pprint_thing("[Condition : [{cond}]]"
- .format(cond=self.condition))
+ return pprint_thing("[Condition : [{cond}]]".format(cond=self.condition))
def invert(self):
""" invert the condition """
# if self.condition is not None:
# self.condition = "~(%s)" % self.condition
# return self
- raise NotImplementedError("cannot use an invert condition when "
- "passing to numexpr")
+ raise NotImplementedError(
+ "cannot use an invert condition when " "passing to numexpr"
+ )
def format(self):
""" return the actual ne format """
@@ -321,8 +318,7 @@ def format(self):
def evaluate(self):
if not self.is_valid:
- raise ValueError("query term is not valid [{slf}]"
- .format(slf=self))
+ raise ValueError("query term is not valid [{slf}]".format(slf=self))
# convert values if we are in the table
if not self.is_in_table:
@@ -332,12 +328,12 @@ def evaluate(self):
values = [self.convert_value(v) for v in rhs]
# equality conditions
- if self.op in ['==', '!=']:
+ if self.op in ["==", "!="]:
# too many values to create the expression?
if len(values) <= self._max_selectors:
vs = [self.generate(v) for v in values]
- self.condition = "({cond})".format(cond=' | '.join(vs))
+ self.condition = "({cond})".format(cond=" | ".join(vs))
# use a filter after reading
else:
@@ -349,19 +345,17 @@ def evaluate(self):
class JointConditionBinOp(ConditionBinOp):
-
def evaluate(self):
- self.condition = "({lhs} {op} {rhs})".format(lhs=self.lhs.condition,
- op=self.op,
- rhs=self.rhs.condition)
+ self.condition = "({lhs} {op} {rhs})".format(
+ lhs=self.lhs.condition, op=self.op, rhs=self.rhs.condition
+ )
return self
class UnaryOp(ops.UnaryOp):
-
def prune(self, klass):
- if self.op != '~':
+ if self.op != "~":
raise NotImplementedError("UnaryOp only support invert type ops")
operand = self.operand
@@ -378,7 +372,7 @@ def prune(self, klass):
return None
-_op_classes = {'unary': UnaryOp}
+_op_classes = {"unary": UnaryOp}
class ExprVisitor(BaseExprVisitor):
@@ -389,24 +383,27 @@ def __init__(self, env, engine, parser, **kwargs):
super().__init__(env, engine, parser)
for bin_op in self.binary_ops:
bin_node = self.binary_op_nodes_map[bin_op]
- setattr(self, 'visit_{node}'.format(node=bin_node),
- lambda node, bin_op=bin_op: partial(BinOp, bin_op,
- **kwargs))
+ setattr(
+ self,
+ "visit_{node}".format(node=bin_node),
+ lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs),
+ )
def visit_UnaryOp(self, node, **kwargs):
if isinstance(node.op, (ast.Not, ast.Invert)):
- return UnaryOp('~', self.visit(node.operand))
+ return UnaryOp("~", self.visit(node.operand))
elif isinstance(node.op, ast.USub):
return self.const_type(-self.visit(node.operand).value, self.env)
elif isinstance(node.op, ast.UAdd):
- raise NotImplementedError('Unary addition not supported')
+ raise NotImplementedError("Unary addition not supported")
def visit_Index(self, node, **kwargs):
return self.visit(node.value).value
def visit_Assign(self, node, **kwargs):
- cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0],
- comparators=[node.value])
+ cmpr = ast.Compare(
+ ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]
+ )
return self.visit(cmpr)
def visit_Subscript(self, node, **kwargs):
@@ -422,8 +419,10 @@ def visit_Subscript(self, node, **kwargs):
try:
return self.const_type(value[slobj], self.env)
except TypeError:
- raise ValueError("cannot subscript {value!r} with "
- "{slobj!r}".format(value=value, slobj=slobj))
+ raise ValueError(
+ "cannot subscript {value!r} with "
+ "{slobj!r}".format(value=value, slobj=slobj)
+ )
def visit_Attribute(self, node, **kwargs):
attr = node.attr
@@ -448,8 +447,7 @@ def visit_Attribute(self, node, **kwargs):
if isinstance(value, ast.Name) and value.id == attr:
return resolved
- raise ValueError("Invalid Attribute context {name}"
- .format(name=ctx.__name__))
+ raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__))
def translate_In(self, op):
return ast.Eq() if isinstance(op, ast.In) else op
@@ -478,8 +476,9 @@ def _validate_where(w):
"""
if not (isinstance(w, (Expr, str)) or is_list_like(w)):
- raise TypeError("where must be passed as a string, Expr, "
- "or list-like of Exprs")
+ raise TypeError(
+ "where must be passed as a string, Expr, " "or list-like of Exprs"
+ )
return w
@@ -537,16 +536,20 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=0):
else:
w = _validate_where(w)
where[idx] = w
- where = ' & '.join(map('({})'.format, com.flatten(where))) # noqa
+ where = " & ".join(map("({})".format, com.flatten(where))) # noqa
self.expr = where
self.env = Scope(scope_level + 1, local_dict=local_dict)
if queryables is not None and isinstance(self.expr, str):
self.env.queryables.update(queryables)
- self._visitor = ExprVisitor(self.env, queryables=queryables,
- parser='pytables', engine='pytables',
- encoding=encoding)
+ self._visitor = ExprVisitor(
+ self.env,
+ queryables=queryables,
+ parser="pytables",
+ engine="pytables",
+ encoding=encoding,
+ )
self.terms = self.parse()
def __str__(self):
@@ -560,15 +563,17 @@ def evaluate(self):
try:
self.condition = self.terms.prune(ConditionBinOp)
except AttributeError:
- raise ValueError("cannot process expression [{expr}], [{slf}] "
- "is not a valid condition".format(expr=self.expr,
- slf=self))
+ raise ValueError(
+ "cannot process expression [{expr}], [{slf}] "
+ "is not a valid condition".format(expr=self.expr, slf=self)
+ )
try:
self.filter = self.terms.prune(FilterBinOp)
except AttributeError:
- raise ValueError("cannot process expression [{expr}], [{slf}] "
- "is not a valid filter".format(expr=self.expr,
- slf=self))
+ raise ValueError(
+ "cannot process expression [{expr}], [{slf}] "
+ "is not a valid filter".format(expr=self.expr, slf=self)
+ )
return self.condition, self.filter
@@ -585,11 +590,11 @@ def __init__(self, value, converted, kind):
def tostring(self, encoding):
""" quote the string if not encoded
else encode and return """
- if self.kind == 'string':
+ if self.kind == "string":
if encoding is not None:
return self.converted
return '"{converted}"'.format(converted=self.converted)
- elif self.kind == 'float':
+ elif self.kind == "float":
# python 2 str(float) is not always
# round-trippable so use repr()
return repr(self.converted)
@@ -600,7 +605,7 @@ def maybe_expression(s):
""" loose checking if s is a pytables-acceptable expression """
if not isinstance(s, str):
return False
- ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',)
+ ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ("=",)
# make sure we have an op at least
return any(op in s for op in ops)
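
For reference (an annotation, not part of the patch itself): the BinOp/TermValue/Expr classes reformatted above implement the `where=` query strings accepted by HDFStore. A minimal usage sketch, with a made-up file name and data, assuming PyTables is installed:

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(10)})

with pd.HDFStore("demo.h5", mode="w") as store:
    # format="table" makes the dataset queryable; data_columns=True indexes
    # the columns so they can appear in a where expression.
    store.put("df", df, format="table", data_columns=True)
    # The string below is parsed by ExprVisitor into BinOp terms and then
    # pruned into a numexpr condition or a post-read filter.
    subset = store.select("df", where="A > 5")

print(subset)
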
diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py
index 729acdc52e24a..4d5a523337f66 100644
--- a/pandas/core/computation/scope.py
+++ b/pandas/core/computation/scope.py
@@ -19,11 +19,17 @@
import pandas.core.computation as compu
-def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(),
- target=None, **kwargs):
+def _ensure_scope(
+ level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs
+):
"""Ensure that we are grabbing the correct scope."""
- return Scope(level + 1, global_dict=global_dict, local_dict=local_dict,
- resolvers=resolvers, target=target)
+ return Scope(
+ level + 1,
+ global_dict=global_dict,
+ local_dict=local_dict,
+ resolvers=resolvers,
+ target=target,
+ )
def _replacer(x):
@@ -44,19 +50,19 @@ def _replacer(x):
def _raw_hex_id(obj):
"""Return the padded hexadecimal id of ``obj``."""
# interpret as a pointer since that's what really what id returns
- packed = struct.pack('@P', id(obj))
- return ''.join(map(_replacer, packed))
+ packed = struct.pack("@P", id(obj))
+ return "".join(map(_replacer, packed))
_DEFAULT_GLOBALS = {
- 'Timestamp': Timestamp,
- 'datetime': datetime.datetime,
- 'True': True,
- 'False': False,
- 'list': list,
- 'tuple': tuple,
- 'inf': np.inf,
- 'Inf': np.inf,
+ "Timestamp": Timestamp,
+ "datetime": datetime.datetime,
+ "True": True,
+ "False": False,
+ "list": list,
+ "tuple": tuple,
+ "inf": np.inf,
+ "Inf": np.inf,
}
@@ -98,10 +104,12 @@ class Scope(StringMixin):
target : object
temps : dict
"""
- __slots__ = 'level', 'scope', 'target', 'temps'
- def __init__(self, level, global_dict=None, local_dict=None, resolvers=(),
- target=None):
+ __slots__ = "level", "scope", "target", "temps"
+
+ def __init__(
+ self, level, global_dict=None, local_dict=None, resolvers=(), target=None
+ ):
self.level = level + 1
# shallow copy because we don't want to keep filling this up with what
@@ -121,11 +129,9 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(),
# shallow copy here because we don't want to replace what's in
# scope when we align terms (alignment accesses the underlying
# numpy array of pandas objects)
- self.scope = self.scope.new_child((global_dict or
- frame.f_globals).copy())
+ self.scope = self.scope.new_child((global_dict or frame.f_globals).copy())
if not isinstance(local_dict, Scope):
- self.scope = self.scope.new_child((local_dict or
- frame.f_locals).copy())
+ self.scope = self.scope.new_child((local_dict or frame.f_locals).copy())
finally:
del frame
@@ -138,10 +144,10 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(),
def __str__(self):
scope_keys = _get_pretty_string(list(self.scope.keys()))
res_keys = _get_pretty_string(list(self.resolvers.keys()))
- unicode_str = '{name}(scope={scope_keys}, resolvers={res_keys})'
- return unicode_str.format(name=type(self).__name__,
- scope_keys=scope_keys,
- res_keys=res_keys)
+ unicode_str = "{name}(scope={scope_keys}, resolvers={res_keys})"
+ return unicode_str.format(
+ name=type(self).__name__, scope_keys=scope_keys, res_keys=res_keys
+ )
@property
def has_resolvers(self):
@@ -232,7 +238,7 @@ def _get_vars(self, stack, scopes):
variables = itertools.product(scopes, stack)
for scope, (frame, _, _, _, _, _) in variables:
try:
- d = getattr(frame, 'f_' + scope)
+ d = getattr(frame, "f_" + scope)
self.scope = self.scope.new_child(d)
finally:
# won't remove it, but DECREF it
@@ -255,7 +261,7 @@ def update(self, level):
stack = inspect.stack()
try:
- self._get_vars(stack[:sl], scopes=['locals'])
+ self._get_vars(stack[:sl], scopes=["locals"])
finally:
del stack[:], stack
@@ -272,9 +278,9 @@ def add_tmp(self, value):
name : basestring
The name of the temporary variable created.
"""
- name = '{name}_{num}_{hex_id}'.format(name=type(value).__name__,
- num=self.ntemps,
- hex_id=_raw_hex_id(self))
+ name = "{name}_{num}_{hex_id}".format(
+ name=type(value).__name__, num=self.ntemps, hex_id=_raw_hex_id(self)
+ )
# add to inner most scope
assert name not in self.temps
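
As an aside (not part of the patch): Scope is what lets DataFrame.query/pd.eval see variables from the calling frame. A small sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
threshold = 2

# The '@' prefix resolves through the Scope chain built from the caller's
# f_locals/f_globals, as set up in Scope.__init__ above.
print(df.query("a > @threshold"))
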
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 856d5076f3755..be6086dd360f2 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -13,8 +13,13 @@
import pandas._config.config as cf
from pandas._config.config import (
- is_bool, is_callable, is_instance_factory, is_int, is_one_of_factory,
- is_text)
+ is_bool,
+ is_callable,
+ is_instance_factory,
+ is_int,
+ is_one_of_factory,
+ is_text,
+)
# compute
@@ -28,6 +33,7 @@
def use_bottleneck_cb(key):
from pandas.core import nanops
+
nanops.set_use_bottleneck(cf.get_option(key))
@@ -41,14 +47,21 @@ def use_bottleneck_cb(key):
def use_numexpr_cb(key):
from pandas.core.computation import expressions
+
expressions.set_use_numexpr(cf.get_option(key))
-with cf.config_prefix('compute'):
- cf.register_option('use_bottleneck', True, use_bottleneck_doc,
- validator=is_bool, cb=use_bottleneck_cb)
- cf.register_option('use_numexpr', True, use_numexpr_doc,
- validator=is_bool, cb=use_numexpr_cb)
+with cf.config_prefix("compute"):
+ cf.register_option(
+ "use_bottleneck",
+ True,
+ use_bottleneck_doc,
+ validator=is_bool,
+ cb=use_bottleneck_cb,
+ )
+ cf.register_option(
+ "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
+ )
#
# options from the "display" namespace
@@ -284,6 +297,7 @@ def use_numexpr_cb(key):
def table_schema_cb(key):
from pandas.io.formats.printing import _enable_data_resource_formatter
+
_enable_data_resource_formatter(cf.get_option(key))
@@ -298,84 +312,117 @@ def is_terminal():
except NameError: # assume standard Python interpreter in a terminal
return True
else:
- if hasattr(ip, 'kernel'): # IPython as a Jupyter kernel
+ if hasattr(ip, "kernel"): # IPython as a Jupyter kernel
return False
else: # IPython in a terminal
return True
-with cf.config_prefix('display'):
- cf.register_option('precision', 6, pc_precision_doc, validator=is_int)
- cf.register_option('float_format', None, float_format_doc,
- validator=is_one_of_factory([None, is_callable]))
- cf.register_option('column_space', 12, validator=is_int)
- cf.register_option('max_info_rows', 1690785, pc_max_info_rows_doc,
- validator=is_instance_factory((int, type(None))))
- cf.register_option('max_rows', 60, pc_max_rows_doc,
- validator=is_instance_factory([type(None), int]))
- cf.register_option('min_rows', 10, pc_min_rows_doc,
- validator=is_instance_factory([type(None), int]))
- cf.register_option('max_categories', 8, pc_max_categories_doc,
- validator=is_int)
- cf.register_option('max_colwidth', 50, max_colwidth_doc, validator=is_int)
+with cf.config_prefix("display"):
+ cf.register_option("precision", 6, pc_precision_doc, validator=is_int)
+ cf.register_option(
+ "float_format",
+ None,
+ float_format_doc,
+ validator=is_one_of_factory([None, is_callable]),
+ )
+ cf.register_option("column_space", 12, validator=is_int)
+ cf.register_option(
+ "max_info_rows",
+ 1690785,
+ pc_max_info_rows_doc,
+ validator=is_instance_factory((int, type(None))),
+ )
+ cf.register_option(
+ "max_rows",
+ 60,
+ pc_max_rows_doc,
+ validator=is_instance_factory([type(None), int]),
+ )
+ cf.register_option(
+ "min_rows",
+ 10,
+ pc_min_rows_doc,
+ validator=is_instance_factory([type(None), int]),
+ )
+ cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)
+ cf.register_option("max_colwidth", 50, max_colwidth_doc, validator=is_int)
if is_terminal():
max_cols = 0 # automatically determine optimal number of columns
else:
max_cols = 20 # cannot determine optimal number of columns
- cf.register_option('max_columns', max_cols, pc_max_cols_doc,
- validator=is_instance_factory([type(None), int]))
- cf.register_option('large_repr', 'truncate', pc_large_repr_doc,
- validator=is_one_of_factory(['truncate', 'info']))
- cf.register_option('max_info_columns', 100, pc_max_info_cols_doc,
- validator=is_int)
- cf.register_option('colheader_justify', 'right', colheader_justify_doc,
- validator=is_text)
- cf.register_option('notebook_repr_html', True, pc_nb_repr_h_doc,
- validator=is_bool)
- cf.register_option('pprint_nest_depth', 3, pc_pprint_nest_depth,
- validator=is_int)
- cf.register_option('multi_sparse', True, pc_multi_sparse_doc,
- validator=is_bool)
- cf.register_option('expand_frame_repr', True, pc_expand_repr_doc)
- cf.register_option('show_dimensions', 'truncate', pc_show_dimensions_doc,
- validator=is_one_of_factory([True, False, 'truncate']))
- cf.register_option('chop_threshold', None, pc_chop_threshold_doc)
- cf.register_option('max_seq_items', 100, pc_max_seq_items)
- cf.register_option('width', 80, pc_width_doc,
- validator=is_instance_factory([type(None), int]))
- cf.register_option('memory_usage', True, pc_memory_usage_doc,
- validator=is_one_of_factory([None, True,
- False, 'deep']))
- cf.register_option('unicode.east_asian_width', False,
- pc_east_asian_width_doc, validator=is_bool)
- cf.register_option('unicode.ambiguous_as_wide', False,
- pc_east_asian_width_doc, validator=is_bool)
- cf.register_option('latex.repr', False,
- pc_latex_repr_doc, validator=is_bool)
- cf.register_option('latex.escape', True, pc_latex_escape,
- validator=is_bool)
- cf.register_option('latex.longtable', False, pc_latex_longtable,
- validator=is_bool)
- cf.register_option('latex.multicolumn', True, pc_latex_multicolumn,
- validator=is_bool)
- cf.register_option('latex.multicolumn_format', 'l', pc_latex_multicolumn,
- validator=is_text)
- cf.register_option('latex.multirow', False, pc_latex_multirow,
- validator=is_bool)
- cf.register_option('html.table_schema', False, pc_table_schema_doc,
- validator=is_bool, cb=table_schema_cb)
- cf.register_option('html.border', 1, pc_html_border_doc,
- validator=is_int)
- cf.register_option('html.use_mathjax', True, pc_html_use_mathjax_doc,
- validator=is_bool)
+ cf.register_option(
+ "max_columns",
+ max_cols,
+ pc_max_cols_doc,
+ validator=is_instance_factory([type(None), int]),
+ )
+ cf.register_option(
+ "large_repr",
+ "truncate",
+ pc_large_repr_doc,
+ validator=is_one_of_factory(["truncate", "info"]),
+ )
+ cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int)
+ cf.register_option(
+ "colheader_justify", "right", colheader_justify_doc, validator=is_text
+ )
+ cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool)
+ cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int)
+ cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool)
+ cf.register_option("expand_frame_repr", True, pc_expand_repr_doc)
+ cf.register_option(
+ "show_dimensions",
+ "truncate",
+ pc_show_dimensions_doc,
+ validator=is_one_of_factory([True, False, "truncate"]),
+ )
+ cf.register_option("chop_threshold", None, pc_chop_threshold_doc)
+ cf.register_option("max_seq_items", 100, pc_max_seq_items)
+ cf.register_option(
+ "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int])
+ )
+ cf.register_option(
+ "memory_usage",
+ True,
+ pc_memory_usage_doc,
+ validator=is_one_of_factory([None, True, False, "deep"]),
+ )
+ cf.register_option(
+ "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool
+ )
+ cf.register_option(
+ "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool
+ )
+ cf.register_option("latex.repr", False, pc_latex_repr_doc, validator=is_bool)
+ cf.register_option("latex.escape", True, pc_latex_escape, validator=is_bool)
+ cf.register_option("latex.longtable", False, pc_latex_longtable, validator=is_bool)
+ cf.register_option(
+ "latex.multicolumn", True, pc_latex_multicolumn, validator=is_bool
+ )
+ cf.register_option(
+ "latex.multicolumn_format", "l", pc_latex_multicolumn, validator=is_text
+ )
+ cf.register_option("latex.multirow", False, pc_latex_multirow, validator=is_bool)
+ cf.register_option(
+ "html.table_schema",
+ False,
+ pc_table_schema_doc,
+ validator=is_bool,
+ cb=table_schema_cb,
+ )
+ cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int)
+ cf.register_option(
+ "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool
+ )
tc_sim_interactive_doc = """
: boolean
Whether to simulate interactive mode for purposes of testing
"""
-with cf.config_prefix('mode'):
- cf.register_option('sim_interactive', False, tc_sim_interactive_doc)
+with cf.config_prefix("mode"):
+ cf.register_option("sim_interactive", False, tc_sim_interactive_doc)
use_inf_as_null_doc = """
: boolean
@@ -396,17 +443,19 @@ def is_terminal():
def use_inf_as_na_cb(key):
from pandas.core.dtypes.missing import _use_inf_as_na
+
_use_inf_as_na(key)
-with cf.config_prefix('mode'):
- cf.register_option('use_inf_as_na', False, use_inf_as_na_doc,
- cb=use_inf_as_na_cb)
- cf.register_option('use_inf_as_null', False, use_inf_as_null_doc,
- cb=use_inf_as_na_cb)
+with cf.config_prefix("mode"):
+ cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb)
+ cf.register_option(
+ "use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb
+ )
-cf.deprecate_option('mode.use_inf_as_null', msg=use_inf_as_null_doc,
- rkey='mode.use_inf_as_na')
+cf.deprecate_option(
+ "mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na"
+)
# user warnings
@@ -416,9 +465,13 @@ def use_inf_as_na_cb(key):
The default is warn
"""
-with cf.config_prefix('mode'):
- cf.register_option('chained_assignment', 'warn', chained_assignment,
- validator=is_one_of_factory([None, 'warn', 'raise']))
+with cf.config_prefix("mode"):
+ cf.register_option(
+ "chained_assignment",
+ "warn",
+ chained_assignment,
+ validator=is_one_of_factory([None, "warn", "raise"]),
+ )
# Set up the io.excel specific reader configuration.
@@ -428,41 +481,45 @@ def use_inf_as_na_cb(key):
auto, {others}.
"""
-_xls_options = ['xlrd']
-_xlsm_options = ['xlrd', 'openpyxl']
-_xlsx_options = ['xlrd', 'openpyxl']
-_ods_options = ['odf']
+_xls_options = ["xlrd"]
+_xlsm_options = ["xlrd", "openpyxl"]
+_xlsx_options = ["xlrd", "openpyxl"]
+_ods_options = ["odf"]
with cf.config_prefix("io.excel.xls"):
- cf.register_option("reader", "auto",
- reader_engine_doc.format(
- ext='xls',
- others=', '.join(_xls_options)),
- validator=str)
+ cf.register_option(
+ "reader",
+ "auto",
+ reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
+ validator=str,
+ )
with cf.config_prefix("io.excel.xlsm"):
- cf.register_option("reader", "auto",
- reader_engine_doc.format(
- ext='xlsm',
- others=', '.join(_xlsm_options)),
- validator=str)
+ cf.register_option(
+ "reader",
+ "auto",
+ reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
+ validator=str,
+ )
with cf.config_prefix("io.excel.xlsx"):
- cf.register_option("reader", "auto",
- reader_engine_doc.format(
- ext='xlsx',
- others=', '.join(_xlsx_options)),
- validator=str)
+ cf.register_option(
+ "reader",
+ "auto",
+ reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
+ validator=str,
+ )
with cf.config_prefix("io.excel.ods"):
- cf.register_option("reader", "auto",
- reader_engine_doc.format(
- ext='ods',
- others=', '.join(_ods_options)),
- validator=str)
+ cf.register_option(
+ "reader",
+ "auto",
+ reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
+ validator=str,
+ )
# Set up the io.excel specific writer configuration.
@@ -472,32 +529,35 @@ def use_inf_as_na_cb(key):
auto, {others}.
"""
-_xls_options = ['xlwt']
-_xlsm_options = ['openpyxl']
-_xlsx_options = ['openpyxl', 'xlsxwriter']
+_xls_options = ["xlwt"]
+_xlsm_options = ["openpyxl"]
+_xlsx_options = ["openpyxl", "xlsxwriter"]
with cf.config_prefix("io.excel.xls"):
- cf.register_option("writer", "auto",
- writer_engine_doc.format(
- ext='xls',
- others=', '.join(_xls_options)),
- validator=str)
+ cf.register_option(
+ "writer",
+ "auto",
+ writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
+ validator=str,
+ )
with cf.config_prefix("io.excel.xlsm"):
- cf.register_option("writer", "auto",
- writer_engine_doc.format(
- ext='xlsm',
- others=', '.join(_xlsm_options)),
- validator=str)
+ cf.register_option(
+ "writer",
+ "auto",
+ writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
+ validator=str,
+ )
with cf.config_prefix("io.excel.xlsx"):
- cf.register_option("writer", "auto",
- writer_engine_doc.format(
- ext='xlsx',
- others=', '.join(_xlsx_options)),
- validator=str)
+ cf.register_option(
+ "writer",
+ "auto",
+ writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
+ validator=str,
+ )
# Set up the io.parquet specific configuration.
@@ -507,10 +567,13 @@ def use_inf_as_na_cb(key):
'auto', 'pyarrow', 'fastparquet', the default is 'auto'
"""
-with cf.config_prefix('io.parquet'):
+with cf.config_prefix("io.parquet"):
cf.register_option(
- 'engine', 'auto', parquet_engine_doc,
- validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet']))
+ "engine",
+ "auto",
+ parquet_engine_doc,
+ validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
+ )
# --------
# Plotting
@@ -526,28 +589,35 @@ def use_inf_as_na_cb(key):
def register_plotting_backend_cb(key):
backend_str = cf.get_option(key)
- if backend_str == 'matplotlib':
+ if backend_str == "matplotlib":
try:
import pandas.plotting._matplotlib # noqa
except ImportError:
- raise ImportError('matplotlib is required for plotting when the '
- 'default backend "matplotlib" is selected.')
+ raise ImportError(
+ "matplotlib is required for plotting when the "
+ 'default backend "matplotlib" is selected.'
+ )
else:
return
try:
importlib.import_module(backend_str)
except ImportError:
- raise ValueError('"{}" does not seem to be an installed module. '
- 'A pandas plotting backend must be a module that '
- 'can be imported'.format(backend_str))
+ raise ValueError(
+ '"{}" does not seem to be an installed module. '
+ "A pandas plotting backend must be a module that "
+ "can be imported".format(backend_str)
+ )
-with cf.config_prefix('plotting'):
- cf.register_option('backend', defval='matplotlib',
- doc=plotting_backend_doc,
- validator=str,
- cb=register_plotting_backend_cb)
+with cf.config_prefix("plotting"):
+ cf.register_option(
+ "backend",
+ defval="matplotlib",
+ doc=plotting_backend_doc,
+ validator=str,
+ cb=register_plotting_backend_cb,
+ )
register_converter_doc = """
@@ -569,5 +639,10 @@ def register_converter_cb(key):
with cf.config_prefix("plotting.matplotlib"):
- cf.register_option("register_converters", True, register_converter_doc,
- validator=bool, cb=register_converter_cb)
+ cf.register_option(
+ "register_converters",
+ True,
+ register_converter_doc,
+ validator=bool,
+ cb=register_converter_cb,
+ )
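
For context (not part of the patch): every cf.register_option call above feeds the public option API. A short usage sketch:

import pandas as pd

# Options registered under the "display", "mode", "io.*" and "plotting"
# prefixes above are read and written through get_option/set_option.
pd.set_option("display.max_rows", 20)
print(pd.get_option("display.max_rows"))

# option_context restores the previous value when the block exits.
with pd.option_context("display.precision", 3):
    print(pd.get_option("display.precision"))
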
diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py
index e9d7b9c4281bd..2b527e1fb5890 100644
--- a/pandas/core/dtypes/api.py
+++ b/pandas/core/dtypes/api.py
@@ -1,14 +1,47 @@
# flake8: noqa
from .common import (
- is_array_like, is_bool, is_bool_dtype, is_categorical,
- is_categorical_dtype, is_complex, is_complex_dtype,
- is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype,
- is_datetime64tz_dtype, is_datetimetz, is_dict_like, is_dtype_equal,
- is_extension_array_dtype, is_extension_type, is_file_like, is_float,
- is_float_dtype, is_hashable, is_int64_dtype, is_integer, is_integer_dtype,
- is_interval, is_interval_dtype, is_iterator, is_list_like, is_named_tuple,
- is_number, is_numeric_dtype, is_object_dtype, is_period, is_period_dtype,
- is_re, is_re_compilable, is_scalar, is_signed_integer_dtype, is_sparse,
- is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype,
- is_unsigned_integer_dtype, pandas_dtype)
+ is_array_like,
+ is_bool,
+ is_bool_dtype,
+ is_categorical,
+ is_categorical_dtype,
+ is_complex,
+ is_complex_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64tz_dtype,
+ is_datetimetz,
+ is_dict_like,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_file_like,
+ is_float,
+ is_float_dtype,
+ is_hashable,
+ is_int64_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_interval,
+ is_interval_dtype,
+ is_iterator,
+ is_list_like,
+ is_named_tuple,
+ is_number,
+ is_numeric_dtype,
+ is_object_dtype,
+ is_period,
+ is_period_dtype,
+ is_re,
+ is_re_compilable,
+ is_scalar,
+ is_signed_integer_dtype,
+ is_sparse,
+ is_string_dtype,
+ is_timedelta64_dtype,
+ is_timedelta64_ns_dtype,
+ is_unsigned_integer_dtype,
+ pandas_dtype,
+)
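
For context (not part of the patch): the names re-exported above are the same introspection helpers exposed publicly as pandas.api.types. A quick sketch:

import pandas as pd
from pandas.api.types import is_datetime64_any_dtype, is_integer_dtype

print(is_integer_dtype(pd.Series([1, 2, 3])))                    # True
print(is_datetime64_any_dtype(pd.to_datetime(["2019-07-03"])))   # True
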
diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
index e7191136a7d53..59ef17e3d121f 100644
--- a/pandas/core/dtypes/base.py
+++ b/pandas/core/dtypes/base.py
@@ -68,6 +68,7 @@ class property**.
``pandas.errors.AbstractMethodError`` and no ``register`` method is
provided for registering virtual subclasses.
"""
+
_metadata = () # type: Tuple[str, ...]
def __str__(self):
@@ -98,8 +99,7 @@ def __eq__(self, other):
return False
if isinstance(other, type(self)):
return all(
- getattr(self, attr) == getattr(other, attr)
- for attr in self._metadata
+ getattr(self, attr) == getattr(other, attr) for attr in self._metadata
)
return False
@@ -146,7 +146,7 @@ def kind(self) -> str:
--------
numpy.dtype.kind
"""
- return 'O'
+ return "O"
@property
def name(self) -> str:
@@ -223,8 +223,9 @@ def construct_from_string(cls, string: str):
if not isinstance(string, str):
raise TypeError("Expects a string, got {}".format(type(string)))
if string != cls.name:
- raise TypeError("Cannot construct a '{}' from '{}'".format(
- cls.__name__, string))
+ raise TypeError(
+ "Cannot construct a '{}' from '{}'".format(cls.__name__, string)
+ )
return cls()
@classmethod
@@ -250,10 +251,9 @@ def is_dtype(cls, dtype) -> bool:
3. ``dtype`` has a ``dtype`` attribute, and any of the above
conditions is true for ``dtype.dtype``.
"""
- dtype = getattr(dtype, 'dtype', dtype)
+ dtype = getattr(dtype, "dtype", dtype)
- if isinstance(dtype, (ABCSeries, ABCIndexClass,
- ABCDataFrame, np.dtype)):
+ if isinstance(dtype, (ABCSeries, ABCIndexClass, ABCDataFrame, np.dtype)):
# https://github.com/pandas-dev/pandas/issues/22960
# avoid passing data to `construct_from_string`. This could
# cause a FutureWarning from numpy about failing elementwise
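
As an illustration (not part of the patch; UnitDtype is a made-up name): the construct_from_string contract reformatted above can be exercised with a toy dtype. A real extension dtype would also pair with an ExtensionArray.

from pandas.api.extensions import ExtensionDtype

class UnitDtype(ExtensionDtype):
    # Overriding `name` as a class attribute is all construct_from_string needs.
    name = "unit"

print(UnitDtype.construct_from_string("unit").name)  # "unit"

try:
    UnitDtype.construct_from_string("not-unit")
except TypeError as err:
    # Mismatched strings raise, per the check in the hunk above.
    print(err)
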
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index c68d469d291e7..f483cf520754b 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -8,19 +8,49 @@
from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT
from .common import (
- _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, _TD_DTYPE, ensure_int8,
- ensure_int16, ensure_int32, ensure_int64, ensure_object, ensure_str,
- is_bool, is_bool_dtype, is_categorical_dtype, is_complex, is_complex_dtype,
- is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype,
- is_datetime_or_timedelta_dtype, is_datetimelike, is_dtype_equal,
- is_extension_array_dtype, is_extension_type, is_float, is_float_dtype,
- is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype,
- is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype,
- pandas_dtype)
+ _INT64_DTYPE,
+ _NS_DTYPE,
+ _POSSIBLY_CAST_DTYPES,
+ _TD_DTYPE,
+ ensure_int8,
+ ensure_int16,
+ ensure_int32,
+ ensure_int64,
+ ensure_object,
+ ensure_str,
+ is_bool,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_complex,
+ is_complex_dtype,
+ is_datetime64_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype,
+ is_datetimelike,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_float,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_object_dtype,
+ is_scalar,
+ is_string_dtype,
+ is_timedelta64_dtype,
+ is_timedelta64_ns_dtype,
+ is_unsigned_integer_dtype,
+ pandas_dtype,
+)
from .dtypes import DatetimeTZDtype, ExtensionDtype, PeriodDtype
from .generic import (
- ABCDatetimeArray, ABCDatetimeIndex, ABCPeriodArray, ABCPeriodIndex,
- ABCSeries)
+ ABCDatetimeArray,
+ ABCDatetimeIndex,
+ ABCPeriodArray,
+ ABCPeriodIndex,
+ ABCSeries,
+)
from .inference import is_list_like
from .missing import isna, notna
@@ -35,8 +65,8 @@ def maybe_convert_platform(values):
if isinstance(values, (list, tuple, range)):
values = construct_1d_object_array_from_listlike(values)
- if getattr(values, 'dtype', None) == np.object_:
- if hasattr(values, '_values'):
+ if getattr(values, "dtype", None) == np.object_:
+ if hasattr(values, "_values"):
values = values._values
values = lib.maybe_convert_objects(values)
@@ -72,27 +102,27 @@ def trans(x):
return x
if isinstance(dtype, str):
- if dtype == 'infer':
- inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
- skipna=False)
- if inferred_type == 'boolean':
- dtype = 'bool'
- elif inferred_type == 'integer':
- dtype = 'int64'
- elif inferred_type == 'datetime64':
- dtype = 'datetime64[ns]'
- elif inferred_type == 'timedelta64':
- dtype = 'timedelta64[ns]'
+ if dtype == "infer":
+ inferred_type = lib.infer_dtype(ensure_object(result.ravel()), skipna=False)
+ if inferred_type == "boolean":
+ dtype = "bool"
+ elif inferred_type == "integer":
+ dtype = "int64"
+ elif inferred_type == "datetime64":
+ dtype = "datetime64[ns]"
+ elif inferred_type == "timedelta64":
+ dtype = "timedelta64[ns]"
# try to upcast here
- elif inferred_type == 'floating':
- dtype = 'int64'
+ elif inferred_type == "floating":
+ dtype = "int64"
if issubclass(result.dtype.type, np.number):
def trans(x): # noqa
return x.round()
+
else:
- dtype = 'object'
+ dtype = "object"
if isinstance(dtype, str):
dtype = np.dtype(dtype)
@@ -101,8 +131,7 @@ def trans(x): # noqa
# don't allow upcasts here (except if empty)
if dtype.kind == result.dtype.kind:
- if (result.dtype.itemsize <= dtype.itemsize and
- np.prod(result.shape)):
+ if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape):
return result
if is_bool_dtype(dtype) or is_integer_dtype(dtype):
@@ -116,17 +145,21 @@ def trans(x): # noqa
arr = np.array([r[0]])
# if we have any nulls, then we are done
- if (isna(arr).any() or
- not np.allclose(arr, trans(arr).astype(dtype), rtol=0)):
+ if isna(arr).any() or not np.allclose(
+ arr, trans(arr).astype(dtype), rtol=0
+ ):
return result
# a comparable, e.g. a Decimal may slip in here
- elif not isinstance(r[0], (np.integer, np.floating, np.bool, int,
- float, bool)):
+ elif not isinstance(
+ r[0], (np.integer, np.floating, np.bool, int, float, bool)
+ ):
return result
- if (issubclass(result.dtype.type, (np.object_, np.number)) and
- notna(result).all()):
+ if (
+ issubclass(result.dtype.type, (np.object_, np.number))
+ and notna(result).all()
+ ):
new_result = trans(result).astype(dtype)
try:
if np.allclose(new_result, result, rtol=0):
@@ -137,20 +170,20 @@ def trans(x): # noqa
# hit here
if (new_result == result).all():
return new_result
- elif (issubclass(dtype.type, np.floating) and
- not is_bool_dtype(result.dtype)):
+ elif issubclass(dtype.type, np.floating) and not is_bool_dtype(result.dtype):
return result.astype(dtype)
# a datetimelike
# GH12821, iNaT is casted to float
- elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']:
+ elif dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
try:
result = result.astype(dtype)
except Exception:
if dtype.tz:
# convert to datetime and change timezone
from pandas import to_datetime
- result = to_datetime(result).tz_localize('utc')
+
+ result = to_datetime(result).tz_localize("utc")
result = result.tz_convert(dtype.tz)
elif dtype.type == Period:
@@ -206,7 +239,7 @@ def maybe_upcast_putmask(result, mask, other):
if is_datetimelike(result.dtype):
if is_scalar(other):
if isna(other):
- other = result.dtype.type('nat')
+ other = result.dtype.type("nat")
elif is_integer(other):
other = np.array(other, dtype=result.dtype)
elif is_integer_dtype(other):
@@ -244,8 +277,7 @@ def changeit():
# we have a scalar or len 0 ndarray
# and its nan and we are changing some values
- if (is_scalar(other) or
- (isinstance(other, np.ndarray) and other.ndim < 1)):
+ if is_scalar(other) or (isinstance(other, np.ndarray) and other.ndim < 1):
if isna(other):
return changeit()
@@ -385,10 +417,10 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
elif isinstance(val, (np.datetime64, datetime)):
val = tslibs.Timestamp(val)
if val is tslibs.NaT or val.tz is None:
- dtype = np.dtype('M8[ns]')
+ dtype = np.dtype("M8[ns]")
else:
if pandas_dtype:
- dtype = DatetimeTZDtype(unit='ns', tz=val.tz)
+ dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
else:
# return datetimetz as object
return np.object_, val
@@ -396,7 +428,7 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
elif isinstance(val, (np.timedelta64, timedelta)):
val = tslibs.Timedelta(val).value
- dtype = np.dtype('m8[ns]')
+ dtype = np.dtype("m8[ns]")
elif is_bool(val):
dtype = np.bool_
@@ -473,8 +505,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
# don't force numpy coerce with nan's
inferred = lib.infer_dtype(arr, skipna=False)
- if inferred in ['string', 'bytes', 'unicode',
- 'mixed', 'mixed-integer']:
+ if inferred in ["string", "bytes", "unicode", "mixed", "mixed-integer"]:
return (np.object_, arr)
arr = np.asarray(arr)
@@ -506,7 +537,7 @@ def maybe_infer_dtype_type(element):
numpy.int64
"""
tipo = None
- if hasattr(element, 'dtype'):
+ if hasattr(element, "dtype"):
tipo = element.dtype
elif is_list_like(element):
element = np.asarray(element)
@@ -547,15 +578,16 @@ def maybe_cast_item(obj, item, dtype):
if dtype in (np.object_, np.bool_):
obj[item] = chunk.astype(np.object_)
elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover
- raise ValueError("Unexpected dtype encountered: {dtype}"
- .format(dtype=dtype))
+ raise ValueError(
+ "Unexpected dtype encountered: {dtype}".format(dtype=dtype)
+ )
def invalidate_string_dtypes(dtype_set):
"""Change string like dtypes to object for
``DataFrame.select_dtypes()``.
"""
-    non_string_dtypes = dtype_set - {np.dtype('S').type, np.dtype('<U').type}
    elif conversion_count > 1 and coerce:
- raise ValueError("Only one of 'datetime', 'numeric' or "
- "'timedelta' can be True when when coerce=True.")
+ raise ValueError(
+ "Only one of 'datetime', 'numeric' or "
+ "'timedelta' can be True when when coerce=True."
+ )
if isinstance(values, (list, tuple)):
# List or scalar
values = np.array(values, dtype=np.object_)
- elif not hasattr(values, 'dtype'):
+ elif not hasattr(values, "dtype"):
values = np.array([values], dtype=np.object_)
elif not is_object_dtype(values.dtype):
# If not object, do not attempt conversion
@@ -798,21 +839,23 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True,
# Immediate return if coerce
if datetime:
from pandas import to_datetime
- return to_datetime(values, errors='coerce').to_numpy()
+
+ return to_datetime(values, errors="coerce").to_numpy()
elif timedelta:
from pandas import to_timedelta
- return to_timedelta(values, errors='coerce').to_numpy()
+
+ return to_timedelta(values, errors="coerce").to_numpy()
elif numeric:
from pandas import to_numeric
- return to_numeric(values, errors='coerce')
+
+ return to_numeric(values, errors="coerce")
# Soft conversions
if datetime:
# GH 20380, when datetime is beyond year 2262, hence outside
# bound of nanosecond-resolution 64-bit integers.
try:
- values = lib.maybe_convert_objects(values,
- convert_datetime=datetime)
+ values = lib.maybe_convert_objects(values, convert_datetime=datetime)
except OutOfBoundsDatetime:
pass
@@ -822,8 +865,7 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True,
if numeric and is_object_dtype(values.dtype):
try:
- converted = lib.maybe_convert_numeric(values, set(),
- coerce_numeric=True)
+ converted = lib.maybe_convert_numeric(values, set(), coerce_numeric=True)
# If all NaNs, then do not-alter
values = converted if not isna(converted).all() else values
values = values.copy() if copy else values
@@ -839,9 +881,9 @@ def maybe_castable(arr):
# check datetime64[ns]/timedelta64[ns] are valid
# otherwise try to coerce
kind = arr.dtype.kind
- if kind == 'M':
+ if kind == "M":
return is_datetime64_ns_dtype(arr.dtype)
- elif kind == 'm':
+ elif kind == "m":
return is_timedelta64_ns_dtype(arr.dtype)
return arr.dtype.name not in _POSSIBLY_CAST_DTYPES
@@ -866,8 +908,9 @@ def maybe_infer_to_datetimelike(value, convert_dates=False):
"""
# TODO: why not timedelta?
- if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex,
- ABCDatetimeArray, ABCPeriodArray)):
+ if isinstance(
+ value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray)
+ ):
return value
elif isinstance(value, ABCSeries):
if isinstance(value._values, ABCDatetimeIndex):
@@ -894,9 +937,7 @@ def try_datetime(v):
# safe coerce to datetime64
try:
# GH19671
- v = tslib.array_to_datetime(v,
- require_iso8601=True,
- errors='raise')[0]
+ v = tslib.array_to_datetime(v, require_iso8601=True, errors="raise")[0]
except ValueError:
# we might have a sequence of the same-datetimes with tz's
@@ -907,8 +948,7 @@ def try_datetime(v):
from pandas import DatetimeIndex
values, tz = conversion.datetime_to_datetime64(v)
- return DatetimeIndex(values).tz_localize(
- 'UTC').tz_convert(tz=tz)
+ return DatetimeIndex(values).tz_localize("UTC").tz_convert(tz=tz)
except (ValueError, TypeError):
pass
@@ -922,6 +962,7 @@ def try_timedelta(v):
# will try first with a string & object conversion
from pandas import to_timedelta
+
try:
return to_timedelta(v)._ndarray_values.reshape(shape)
except Exception:
@@ -929,13 +970,13 @@ def try_timedelta(v):
inferred_type = lib.infer_datetimelike_array(ensure_object(v))
- if inferred_type == 'date' and convert_dates:
+ if inferred_type == "date" and convert_dates:
value = try_datetime(v)
- elif inferred_type == 'datetime':
+ elif inferred_type == "datetime":
value = try_datetime(v)
- elif inferred_type == 'timedelta':
+ elif inferred_type == "timedelta":
value = try_timedelta(v)
- elif inferred_type == 'nat':
+ elif inferred_type == "nat":
# if all NaT, return as datetime
if isna(v).all():
@@ -946,7 +987,7 @@ def try_timedelta(v):
# try timedelta first to avoid spurious datetime conversions
# e.g. '00:00:01' is a timedelta but technically is also a datetime
value = try_timedelta(v)
- if lib.infer_dtype(value, skipna=False) in ['mixed']:
+ if lib.infer_dtype(value, skipna=False) in ["mixed"]:
# cannot skip missing values, as NaT implies that the string
# is actually a datetime
value = try_datetime(v)
@@ -954,7 +995,7 @@ def try_timedelta(v):
return value
-def maybe_cast_to_datetime(value, dtype, errors='raise'):
+def maybe_cast_to_datetime(value, dtype, errors="raise"):
""" try to cast the array/value to a datetimelike dtype, converting float
nan to iNaT
"""
@@ -972,17 +1013,21 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
if is_datetime64 or is_datetime64tz or is_timedelta64:
# Force the dtype if needed.
- msg = ("The '{dtype}' dtype has no unit. "
- "Please pass in '{dtype}[ns]' instead.")
+ msg = (
+ "The '{dtype}' dtype has no unit. "
+ "Please pass in '{dtype}[ns]' instead."
+ )
if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE):
- if dtype.name in ('datetime64', 'datetime64[ns]'):
- if dtype.name == 'datetime64':
+ if dtype.name in ("datetime64", "datetime64[ns]"):
+ if dtype.name == "datetime64":
raise ValueError(msg.format(dtype=dtype.name))
dtype = _NS_DTYPE
else:
- raise TypeError("cannot convert datetimelike to "
- "dtype [{dtype}]".format(dtype=dtype))
+ raise TypeError(
+ "cannot convert datetimelike to "
+ "dtype [{dtype}]".format(dtype=dtype)
+ )
elif is_datetime64tz:
# our NaT doesn't support tz's
@@ -992,13 +1037,15 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
value = [value]
elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE):
- if dtype.name in ('timedelta64', 'timedelta64[ns]'):
- if dtype.name == 'timedelta64':
+ if dtype.name in ("timedelta64", "timedelta64[ns]"):
+ if dtype.name == "timedelta64":
raise ValueError(msg.format(dtype=dtype.name))
dtype = _TD_DTYPE
else:
- raise TypeError("cannot convert timedeltalike to "
- "dtype [{dtype}]".format(dtype=dtype))
+ raise TypeError(
+ "cannot convert timedeltalike to "
+ "dtype [{dtype}]".format(dtype=dtype)
+ )
if is_scalar(value):
if value == iNaT or isna(value):
@@ -1011,8 +1058,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
value = iNaT
# we have an array of datetime or timedeltas & nulls
- elif np.prod(value.shape) or not is_dtype_equal(value.dtype,
- dtype):
+ elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype):
try:
if is_datetime64:
value = to_datetime(value, errors=errors)
@@ -1034,8 +1080,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
else:
# Numeric values are UTC at this point,
# so localize and convert
- value = (value.tz_localize('UTC')
- .tz_convert(dtype.tz))
+ value = value.tz_localize("UTC").tz_convert(dtype.tz)
elif is_timedelta64:
value = to_timedelta(value, errors=errors)._values
except OutOfBoundsDatetime:
@@ -1048,12 +1093,11 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
if is_object_dtype(dtype):
if value.dtype != _NS_DTYPE:
value = value.astype(_NS_DTYPE)
- ints = np.asarray(value).view('i8')
+ ints = np.asarray(value).view("i8")
return tslib.ints_to_pydatetime(ints)
# we have a non-castable dtype that was passed
- raise TypeError('Cannot cast datetime64 to {dtype}'
- .format(dtype=dtype))
+ raise TypeError("Cannot cast datetime64 to {dtype}".format(dtype=dtype))
else:
@@ -1061,20 +1105,24 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
# catch a datetime/timedelta that is not of ns variety
# and no coercion specified
- if is_array and value.dtype.kind in ['M', 'm']:
+ if is_array and value.dtype.kind in ["M", "m"]:
dtype = value.dtype
- if dtype.kind == 'M' and dtype != _NS_DTYPE:
+ if dtype.kind == "M" and dtype != _NS_DTYPE:
value = tslibs.conversion.ensure_datetime64ns(value)
- elif dtype.kind == 'm' and dtype != _TD_DTYPE:
+ elif dtype.kind == "m" and dtype != _TD_DTYPE:
value = to_timedelta(value)
# only do this if we have an array and the dtype of the array is not
# setup already we are not an integer/object, so don't bother with this
# conversion
- elif not (is_array and not (issubclass(value.dtype.type, np.integer) or
- value.dtype == np.object_)):
+ elif not (
+ is_array
+ and not (
+ issubclass(value.dtype.type, np.integer) or value.dtype == np.object_
+ )
+ ):
value = maybe_infer_to_datetimelike(value)
return value
@@ -1099,7 +1147,7 @@ def find_common_type(types):
"""
if len(types) == 0:
- raise ValueError('no types given')
+ raise ValueError("no types given")
first = types[0]
@@ -1113,9 +1161,9 @@ def find_common_type(types):
# take lowest unit
if all(is_datetime64_dtype(t) for t in types):
- return np.dtype('datetime64[ns]')
+ return np.dtype("datetime64[ns]")
if all(is_timedelta64_dtype(t) for t in types):
- return np.dtype('timedelta64[ns]')
+ return np.dtype("timedelta64[ns]")
# don't mix bool / int or float or complex
# this is different from numpy, which casts bool with float/int as int
@@ -1174,9 +1222,11 @@ def construct_1d_arraylike_from_scalar(value, length, dtype):
"""
if is_datetime64tz_dtype(dtype):
from pandas import DatetimeIndex
+
subarr = DatetimeIndex([value] * length, dtype=dtype)
elif is_categorical_dtype(dtype):
from pandas import Categorical
+
subarr = Categorical([value] * length, dtype=dtype)
else:
if not isinstance(dtype, (np.dtype, type(np.dtype))):
@@ -1184,7 +1234,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype):
if length and is_integer_dtype(dtype) and isna(value):
# coerce if we have nan for an integer dtype
- dtype = np.dtype('float64')
+ dtype = np.dtype("float64")
elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
# we need to coerce to object dtype to avoid
# to allow numpy to take our string as a scalar value
@@ -1218,7 +1268,7 @@ def construct_1d_object_array_from_listlike(values):
"""
# numpy will try to interpret nested lists as further dimensions, hence
# making a 1D array that contains list-likes is a bit tricky:
- result = np.empty(len(values), dtype='object')
+ result = np.empty(len(values), dtype="object")
result[:] = values
return result
@@ -1314,8 +1364,10 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False):
else:
casted = arr.astype(dtype, copy=copy)
except OverflowError:
- raise OverflowError("The elements provided in the data cannot all be "
- "casted to the dtype {dtype}".format(dtype=dtype))
+ raise OverflowError(
+ "The elements provided in the data cannot all be "
+ "casted to the dtype {dtype}".format(dtype=dtype)
+ )
if np.array_equal(arr, casted):
return casted
@@ -1328,9 +1380,7 @@ def maybe_cast_to_integer_array(arr, dtype, copy=False):
arr = np.asarray(arr)
if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
- raise OverflowError("Trying to coerce negative values "
- "to unsigned integers")
+ raise OverflowError("Trying to coerce negative values " "to unsigned integers")
- if is_integer_dtype(dtype) and (is_float_dtype(arr) or
- is_object_dtype(arr)):
+ if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)):
raise ValueError("Trying to coerce float values to integers")
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index b2b74e2a70ca9..d0e4bd9b4482a 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -9,22 +9,61 @@
from pandas.compat import PY36
from pandas.core.dtypes.dtypes import (
- CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype,
- PeriodDtype, registry)
+ CategoricalDtype,
+ DatetimeTZDtype,
+ ExtensionDtype,
+ IntervalDtype,
+ PeriodDtype,
+ registry,
+)
from pandas.core.dtypes.generic import (
- ABCCategorical, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass,
- ABCPeriodArray, ABCPeriodIndex, ABCSeries)
+ ABCCategorical,
+ ABCDateOffset,
+ ABCDatetimeIndex,
+ ABCIndexClass,
+ ABCPeriodArray,
+ ABCPeriodIndex,
+ ABCSeries,
+)
from pandas.core.dtypes.inference import ( # noqa:F401
- is_array_like, is_bool, is_complex, is_decimal, is_dict_like, is_file_like,
- is_float, is_hashable, is_integer, is_interval, is_iterator, is_list_like,
- is_named_tuple, is_nested_list_like, is_number, is_re, is_re_compilable,
- is_scalar, is_sequence, is_string_like)
+ is_array_like,
+ is_bool,
+ is_complex,
+ is_decimal,
+ is_dict_like,
+ is_file_like,
+ is_float,
+ is_hashable,
+ is_integer,
+ is_interval,
+ is_iterator,
+ is_list_like,
+ is_named_tuple,
+ is_nested_list_like,
+ is_number,
+ is_re,
+ is_re_compilable,
+ is_scalar,
+ is_sequence,
+ is_string_like,
+)
from pandas._typing import ArrayLike
-_POSSIBLY_CAST_DTYPES = {np.dtype(t).name
- for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
- 'int32', 'uint32', 'int64', 'uint64']}
+_POSSIBLY_CAST_DTYPES = {
+ np.dtype(t).name
+ for t in [
+ "O",
+ "int8",
+ "uint8",
+ "int16",
+ "uint16",
+ "int32",
+ "uint32",
+ "int64",
+ "uint64",
+ ]
+}
_NS_DTYPE = conversion.NS_DTYPE
_TD_DTYPE = conversion.TD_DTYPE
@@ -74,7 +113,7 @@ def ensure_str(value: Union[bytes, Any]) -> str:
Ensure that bytes and non-strings get converted into ``str`` objects.
"""
if isinstance(value, bytes):
- value = value.decode('utf-8')
+ value = value.decode("utf-8")
elif not isinstance(value, str):
value = str(value)
return value
@@ -97,6 +136,7 @@ def ensure_categorical(arr):
if not is_categorical(arr):
from pandas import Categorical
+
arr = Categorical(arr)
return arr
@@ -128,13 +168,13 @@ def ensure_int_or_float(arr: ArrayLike, copy=False) -> np.array:
will remain unchanged.
"""
try:
- return arr.astype('int64', copy=copy, casting='safe')
+ return arr.astype("int64", copy=copy, casting="safe")
except TypeError:
pass
try:
- return arr.astype('uint64', copy=copy, casting='safe')
+ return arr.astype("uint64", copy=copy, casting="safe")
except TypeError:
- return arr.astype('float64', copy=copy)
+ return arr.astype("float64", copy=copy)
def ensure_python_int(value: Union[int, np.integer]) -> int:
@@ -154,12 +194,13 @@ def ensure_python_int(value: Union[int, np.integer]) -> int:
TypeError: if the value isn't an int or can't be converted to one.
"""
if not is_scalar(value):
- raise TypeError("Value needs to be a scalar value, was type {}"
- .format(type(value)))
+ raise TypeError(
+ "Value needs to be a scalar value, was type {}".format(type(value))
+ )
msg = "Wrong type {} for value {}"
try:
new_value = int(value)
- assert (new_value == value)
+ assert new_value == value
except (TypeError, ValueError, AssertionError):
raise TypeError(msg.format(type(value), value))
return new_value
@@ -175,8 +216,10 @@ def classes_and_not_datetimelike(*klasses):
evaluate if the tipo is a subclass of the klasses
and not a datetimelike
"""
- return lambda tipo: (issubclass(tipo, klasses) and
- not issubclass(tipo, (np.datetime64, np.timedelta64)))
+ return lambda tipo: (
+ issubclass(tipo, klasses)
+ and not issubclass(tipo, (np.datetime64, np.timedelta64))
+ )
def is_object_dtype(arr_or_dtype):
@@ -267,7 +310,7 @@ def is_sparse(arr):
"""
from pandas.core.arrays.sparse import SparseDtype
- dtype = getattr(arr, 'dtype', arr)
+ dtype = getattr(arr, "dtype", arr)
return isinstance(dtype, SparseDtype)
@@ -385,9 +428,12 @@ def is_datetimetz(arr):
True
"""
- warnings.warn("'is_datetimetz' is deprecated and will be removed in a "
- "future version. Use 'is_datetime64tz_dtype' instead.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'is_datetimetz' is deprecated and will be removed in a "
+ "future version. Use 'is_datetime64tz_dtype' instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return is_datetime64tz_dtype(arr)
@@ -417,8 +463,7 @@ def is_offsetlike(arr_or_obj):
"""
if isinstance(arr_or_obj, ABCDateOffset):
return True
- elif (is_list_like(arr_or_obj) and len(arr_or_obj) and
- is_object_dtype(arr_or_obj)):
+ elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj):
return all(isinstance(x, ABCDateOffset) for x in arr_or_obj)
return False
@@ -449,9 +494,13 @@ def is_period(arr):
True
"""
- warnings.warn("'is_period' is deprecated and will be removed in a future "
- "version. Use 'is_period_dtype' or is_period_arraylike' "
- "instead.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'is_period' is deprecated and will be removed in a future "
+ "version. Use 'is_period_dtype' or is_period_arraylike' "
+ "instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr)
@@ -690,7 +739,8 @@ def is_string_dtype(arr_or_dtype):
# TODO: gh-15585: consider making the checks stricter.
def condition(dtype):
- return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype)
+ return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype)
+
return _is_dtype(arr_or_dtype, condition)
@@ -723,7 +773,7 @@ def is_period_arraylike(arr):
return True
elif isinstance(arr, (np.ndarray, ABCSeries)):
return is_period_dtype(arr.dtype)
- return getattr(arr, 'inferred_type', None) == 'period'
+ return getattr(arr, "inferred_type", None) == "period"
def is_datetime_arraylike(arr):
@@ -754,9 +804,11 @@ def is_datetime_arraylike(arr):
if isinstance(arr, ABCDatetimeIndex):
return True
elif isinstance(arr, (np.ndarray, ABCSeries)):
- return (is_object_dtype(arr.dtype)
- and lib.infer_dtype(arr, skipna=False) == 'datetime')
- return getattr(arr, 'inferred_type', None) == 'datetime'
+ return (
+ is_object_dtype(arr.dtype)
+ and lib.infer_dtype(arr, skipna=False) == "datetime"
+ )
+ return getattr(arr, "inferred_type", None) == "datetime"
def is_datetimelike(arr):
@@ -799,9 +851,12 @@ def is_datetimelike(arr):
True
"""
- return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or
- is_timedelta64_dtype(arr) or
- isinstance(arr, ABCPeriodIndex))
+ return (
+ is_datetime64_dtype(arr)
+ or is_datetime64tz_dtype(arr)
+ or is_timedelta64_dtype(arr)
+ or isinstance(arr, ABCPeriodIndex)
+ )
def is_dtype_equal(source, target):
@@ -925,8 +980,7 @@ def is_any_int_dtype(arr_or_dtype):
False
"""
- return _is_dtype_type(
- arr_or_dtype, classes(np.integer, np.timedelta64))
+ return _is_dtype_type(arr_or_dtype, classes(np.integer, np.timedelta64))
def is_integer_dtype(arr_or_dtype):
@@ -981,8 +1035,7 @@ def is_integer_dtype(arr_or_dtype):
False
"""
- return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.integer))
+ return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.integer))
def is_signed_integer_dtype(arr_or_dtype):
@@ -1039,8 +1092,7 @@ def is_signed_integer_dtype(arr_or_dtype):
False
"""
- return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.signedinteger))
+ return _is_dtype_type(arr_or_dtype, classes_and_not_datetimelike(np.signedinteger))
def is_unsigned_integer_dtype(arr_or_dtype):
@@ -1088,7 +1140,8 @@ def is_unsigned_integer_dtype(arr_or_dtype):
True
"""
return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger))
+ arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)
+ )
def is_int64_dtype(arr_or_dtype):
@@ -1179,8 +1232,7 @@ def is_datetime64_any_dtype(arr_or_dtype):
if arr_or_dtype is None:
return False
- return (is_datetime64_dtype(arr_or_dtype) or
- is_datetime64tz_dtype(arr_or_dtype))
+ return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype)
def is_datetime64_ns_dtype(arr_or_dtype):
@@ -1230,7 +1282,7 @@ def is_datetime64_ns_dtype(arr_or_dtype):
tipo = _get_dtype(arr_or_dtype.dtype)
else:
return False
- return tipo == _NS_DTYPE or getattr(tipo, 'base', None) == _NS_DTYPE
+ return tipo == _NS_DTYPE or getattr(tipo, "base", None) == _NS_DTYPE
def is_timedelta64_ns_dtype(arr_or_dtype):
@@ -1300,8 +1352,7 @@ def is_datetime_or_timedelta_dtype(arr_or_dtype):
True
"""
- return _is_dtype_type(
- arr_or_dtype, classes(np.datetime64, np.timedelta64))
+ return _is_dtype_type(arr_or_dtype, classes(np.datetime64, np.timedelta64))
def _is_unorderable_exception(e):
@@ -1325,7 +1376,7 @@ def _is_unorderable_exception(e):
if PY36:
return "'>' not supported between instances of" in str(e)
- return 'unorderable' in str(e)
+ return "unorderable" in str(e)
def is_numeric_v_string_like(a, b):
@@ -1380,10 +1431,12 @@ def is_numeric_v_string_like(a, b):
is_a_scalar_string_like = not is_a_array and is_string_like(a)
is_b_scalar_string_like = not is_b_array and is_string_like(b)
- return ((is_a_numeric_array and is_b_scalar_string_like) or
- (is_b_numeric_array and is_a_scalar_string_like) or
- (is_a_numeric_array and is_b_string_array) or
- (is_b_numeric_array and is_a_string_array))
+ return (
+ (is_a_numeric_array and is_b_scalar_string_like)
+ or (is_b_numeric_array and is_a_scalar_string_like)
+ or (is_a_numeric_array and is_b_string_array)
+ or (is_b_numeric_array and is_a_string_array)
+ )
def is_datetimelike_v_numeric(a, b):
@@ -1428,9 +1481,9 @@ def is_datetimelike_v_numeric(a, b):
False
"""
- if not hasattr(a, 'dtype'):
+ if not hasattr(a, "dtype"):
a = np.asarray(a)
- if not hasattr(b, 'dtype'):
+ if not hasattr(b, "dtype"):
b = np.asarray(b)
def is_numeric(x):
@@ -1440,8 +1493,9 @@ def is_numeric(x):
return is_integer_dtype(x) or is_float_dtype(x)
is_datetimelike = needs_i8_conversion
- return ((is_datetimelike(a) and is_numeric(b)) or
- (is_datetimelike(b) and is_numeric(a)))
+ return (is_datetimelike(a) and is_numeric(b)) or (
+ is_datetimelike(b) and is_numeric(a)
+ )
def is_datetimelike_v_object(a, b):
@@ -1487,14 +1541,15 @@ def is_datetimelike_v_object(a, b):
False
"""
- if not hasattr(a, 'dtype'):
+ if not hasattr(a, "dtype"):
a = np.asarray(a)
- if not hasattr(b, 'dtype'):
+ if not hasattr(b, "dtype"):
b = np.asarray(b)
is_datetimelike = needs_i8_conversion
- return ((is_datetimelike(a) and is_object_dtype(b)) or
- (is_datetimelike(b) and is_object_dtype(a)))
+ return (is_datetimelike(a) and is_object_dtype(b)) or (
+ is_datetimelike(b) and is_object_dtype(a)
+ )
def needs_i8_conversion(arr_or_dtype):
@@ -1534,9 +1589,11 @@ def needs_i8_conversion(arr_or_dtype):
if arr_or_dtype is None:
return False
- return (is_datetime_or_timedelta_dtype(arr_or_dtype) or
- is_datetime64tz_dtype(arr_or_dtype) or
- is_period_dtype(arr_or_dtype))
+ return (
+ is_datetime_or_timedelta_dtype(arr_or_dtype)
+ or is_datetime64tz_dtype(arr_or_dtype)
+ or is_period_dtype(arr_or_dtype)
+ )
def is_numeric_dtype(arr_or_dtype):
@@ -1578,7 +1635,8 @@ def is_numeric_dtype(arr_or_dtype):
"""
return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_))
+ arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)
+ )
def is_string_like_dtype(arr_or_dtype):
@@ -1610,8 +1668,7 @@ def is_string_like_dtype(arr_or_dtype):
False
"""
- return _is_dtype(
- arr_or_dtype, lambda dtype: dtype.kind in ('S', 'U'))
+ return _is_dtype(arr_or_dtype, lambda dtype: dtype.kind in ("S", "U"))
def is_float_dtype(arr_or_dtype):
@@ -1705,10 +1762,9 @@ def is_bool_dtype(arr_or_dtype):
# we don't have a boolean Index class
# so its object, we need to infer to
# guess this
- return (arr_or_dtype.is_object and
- arr_or_dtype.inferred_type == 'boolean')
+ return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean"
elif is_extension_array_dtype(arr_or_dtype):
- dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
+ dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype)
return dtype._is_boolean
return issubclass(dtype.type, np.bool_)
@@ -1818,9 +1874,8 @@ def is_extension_array_dtype(arr_or_dtype):
>>> is_extension_array_dtype(arr.dtype)
False
"""
- dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)
- return (isinstance(dtype, ExtensionDtype) or
- registry.find(dtype) is not None)
+ dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype)
+ return isinstance(dtype, ExtensionDtype) or registry.find(dtype) is not None
def is_complex_dtype(arr_or_dtype):
@@ -1911,7 +1966,7 @@ def _get_dtype(arr_or_dtype):
return np.dtype(arr_or_dtype)
# if we have an array-like
- elif hasattr(arr_or_dtype, 'dtype'):
+ elif hasattr(arr_or_dtype, "dtype"):
arr_or_dtype = arr_or_dtype.dtype
return pandas_dtype(arr_or_dtype)
@@ -1944,7 +1999,7 @@ def _is_dtype_type(arr_or_dtype, condition):
return condition(np.dtype(arr_or_dtype).type)
# if we have an array-like
- if hasattr(arr_or_dtype, 'dtype'):
+ if hasattr(arr_or_dtype, "dtype"):
arr_or_dtype = arr_or_dtype.dtype
# we are not possibly a dtype
@@ -2005,13 +2060,13 @@ def infer_dtype_from_object(dtype):
# TODO(jreback)
# should deprecate these
- if dtype in ['datetimetz', 'datetime64tz']:
+ if dtype in ["datetimetz", "datetime64tz"]:
return DatetimeTZDtype.type
- elif dtype in ['period']:
+ elif dtype in ["period"]:
raise NotImplementedError
- if dtype == 'datetime' or dtype == 'timedelta':
- dtype += '64'
+ if dtype == "datetime" or dtype == "timedelta":
+ dtype += "64"
try:
return infer_dtype_from_object(getattr(np, dtype))
except (AttributeError, TypeError):
@@ -2045,9 +2100,9 @@ def _validate_date_like_dtype(dtype):
try:
typ = np.datetime_data(dtype)[0]
except ValueError as e:
- raise TypeError('{error}'.format(error=e))
- if typ != 'generic' and typ != 'ns':
- msg = '{name!r} is too specific of a frequency, try passing {type!r}'
+ raise TypeError("{error}".format(error=e))
+ if typ != "generic" and typ != "ns":
+ msg = "{name!r} is too specific of a frequency, try passing {type!r}"
raise ValueError(msg.format(name=dtype.name, type=dtype.type.__name__))
@@ -2086,19 +2141,18 @@ def pandas_dtype(dtype):
# we don't want to force a repr of the non-string
if not isinstance(dtype, str):
raise TypeError("data type not understood")
- raise TypeError("data type '{}' not understood".format(
- dtype))
+ raise TypeError("data type '{}' not understood".format(dtype))
# Any invalid dtype (such as pd.Timestamp) should raise an error.
# np.dtype(invalid_type).kind = 0 for such objects. However, this will
# also catch some valid dtypes such as object, np.object_ and 'object'
# which we safeguard against by catching them earlier and returning
# np.dtype(valid_dtype) before this condition is evaluated.
- if is_hashable(dtype) and dtype in [object, np.object_, 'object', 'O']:
+ if is_hashable(dtype) and dtype in [object, np.object_, "object", "O"]:
# check hashability to avoid errors/DeprecationWarning when we get
# here and `dtype` is an array
return npdtype
- elif npdtype.kind == 'O':
+ elif npdtype.kind == "O":
raise TypeError("dtype '{}' not understood".format(dtype))
return npdtype
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 66f7a6365fe41..ac74ad5726a99 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -7,12 +7,27 @@
from pandas._libs import tslib, tslibs
from pandas.core.dtypes.common import (
- _NS_DTYPE, _TD_DTYPE, is_bool_dtype, is_categorical_dtype,
- is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal,
- is_extension_array_dtype, is_object_dtype, is_sparse, is_timedelta64_dtype)
+ _NS_DTYPE,
+ _TD_DTYPE,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_object_dtype,
+ is_sparse,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.generic import (
- ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex,
- ABCRangeIndex, ABCSparseDataFrame, ABCTimedeltaIndex)
+ ABCDatetimeArray,
+ ABCDatetimeIndex,
+ ABCIndexClass,
+ ABCPeriodIndex,
+ ABCRangeIndex,
+ ABCSparseDataFrame,
+ ABCTimedeltaIndex,
+)
def get_dtype_kinds(l):
@@ -31,23 +46,23 @@ def get_dtype_kinds(l):
dtype = arr.dtype
if is_categorical_dtype(dtype):
- typ = 'category'
+ typ = "category"
elif is_sparse(arr):
- typ = 'sparse'
+ typ = "sparse"
elif isinstance(arr, ABCRangeIndex):
- typ = 'range'
+ typ = "range"
elif is_datetime64tz_dtype(arr):
# if to_concat contains different tz,
# the result must be object dtype
typ = str(arr.dtype)
elif is_datetime64_dtype(dtype):
- typ = 'datetime'
+ typ = "datetime"
elif is_timedelta64_dtype(dtype):
- typ = 'timedelta'
+ typ = "timedelta"
elif is_object_dtype(dtype):
- typ = 'object'
+ typ = "object"
elif is_bool_dtype(dtype):
- typ = 'bool'
+ typ = "bool"
elif is_extension_array_dtype(dtype):
typ = str(arr.dtype)
else:
@@ -66,8 +81,7 @@ def _get_series_result_type(result, objs=None):
# concat Series with axis 1
if isinstance(result, dict):
# concat Series with axis 1
- if all(isinstance(c, (SparseSeries, SparseDataFrame))
- for c in result.values()):
+ if all(isinstance(c, (SparseSeries, SparseDataFrame)) for c in result.values()):
return SparseDataFrame
else:
return DataFrame
@@ -83,13 +97,12 @@ def _get_frame_result_type(result, objs):
otherwise, return 1st obj
"""
- if (result.blocks and (
- any(isinstance(obj, ABCSparseDataFrame) for obj in objs))):
+ if result.blocks and (any(isinstance(obj, ABCSparseDataFrame) for obj in objs)):
from pandas.core.sparse.api import SparseDataFrame
+
return SparseDataFrame
else:
- return next(obj for obj in objs if not isinstance(obj,
- ABCSparseDataFrame))
+ return next(obj for obj in objs if not isinstance(obj, ABCSparseDataFrame))
def _concat_compat(to_concat, axis=0):
@@ -125,24 +138,24 @@ def is_nonempty(x):
# np.concatenate which has them both implemented is compiled.
typs = get_dtype_kinds(to_concat)
- _contains_datetime = any(typ.startswith('datetime') for typ in typs)
- _contains_period = any(typ.startswith('period') for typ in typs)
+ _contains_datetime = any(typ.startswith("datetime") for typ in typs)
+ _contains_period = any(typ.startswith("period") for typ in typs)
- if 'category' in typs:
+ if "category" in typs:
# this must be prior to _concat_datetime,
# to support Categorical + datetime-like
return _concat_categorical(to_concat, axis=axis)
- elif _contains_datetime or 'timedelta' in typs or _contains_period:
+ elif _contains_datetime or "timedelta" in typs or _contains_period:
return _concat_datetime(to_concat, axis=axis, typs=typs)
# these are mandated to handle empties as well
- elif 'sparse' in typs:
+ elif "sparse" in typs:
return _concat_sparse(to_concat, axis=axis, typs=typs)
all_empty = all(not is_nonempty(x) for x in to_concat)
if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1:
- to_concat = [np.atleast_2d(x.astype('object')) for x in to_concat]
+ to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
if all_empty:
# we have all empties, but may need to coerce the result dtype to
@@ -151,13 +164,12 @@ def is_nonempty(x):
typs = get_dtype_kinds(to_concat)
if len(typs) != 1:
- if (not len(typs - {'i', 'u', 'f'}) or
- not len(typs - {'bool', 'i', 'u'})):
+ if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}):
# let numpy coerce
pass
else:
# coerce to object
- to_concat = [x.astype('object') for x in to_concat]
+ to_concat = [x.astype("object") for x in to_concat]
return np.concatenate(to_concat, axis=axis)
@@ -194,9 +206,14 @@ def _concat_categorical(to_concat, axis=0):
return union_categoricals(categoricals)
# extract the categoricals & coerce to object if needed
- to_concat = [x._internal_get_values() if is_categorical_dtype(x.dtype)
- else np.asarray(x).ravel() if not is_datetime64tz_dtype(x)
- else np.asarray(x.astype(object)) for x in to_concat]
+ to_concat = [
+ x._internal_get_values()
+ if is_categorical_dtype(x.dtype)
+ else np.asarray(x).ravel()
+ if not is_datetime64tz_dtype(x)
+ else np.asarray(x.astype(object))
+ for x in to_concat
+ ]
result = _concat_compat(to_concat)
if axis == 1:
result = result.reshape(1, len(result))
@@ -309,7 +326,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False):
from pandas.core.arrays.categorical import _recode_for_categories
if len(to_union) == 0:
- raise ValueError('No Categoricals to union')
+ raise ValueError("No Categoricals to union")
def _maybe_unwrap(x):
if isinstance(x, (CategoricalIndex, Series)):
@@ -322,8 +339,10 @@ def _maybe_unwrap(x):
to_union = [_maybe_unwrap(x) for x in to_union]
first = to_union[0]
- if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)
- for other in to_union[1:]):
+ if not all(
+ is_dtype_equal(other.categories.dtype, first.categories.dtype)
+ for other in to_union[1:]
+ ):
raise TypeError("dtype of categories must be the same")
ordered = False
@@ -332,25 +351,26 @@ def _maybe_unwrap(x):
categories = first.categories
ordered = first.ordered
- if all(first.categories.equals(other.categories)
- for other in to_union[1:]):
+ if all(first.categories.equals(other.categories) for other in to_union[1:]):
new_codes = np.concatenate([c.codes for c in to_union])
else:
- codes = [first.codes] + [_recode_for_categories(other.codes,
- other.categories,
- first.categories)
- for other in to_union[1:]]
+ codes = [first.codes] + [
+ _recode_for_categories(other.codes, other.categories, first.categories)
+ for other in to_union[1:]
+ ]
new_codes = np.concatenate(codes)
if sort_categories and not ignore_order and ordered:
- raise TypeError("Cannot use sort_categories=True with "
- "ordered Categoricals")
+ raise TypeError(
+ "Cannot use sort_categories=True with " "ordered Categoricals"
+ )
if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)
from pandas.core.algorithms import take_1d
+
new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif ignore_order or all(not c.ordered for c in to_union):
# different categories - union and recode
@@ -359,23 +379,22 @@ def _maybe_unwrap(x):
if sort_categories:
categories = categories.sort_values()
- new_codes = [_recode_for_categories(c.codes, c.categories, categories)
- for c in to_union]
+ new_codes = [
+ _recode_for_categories(c.codes, c.categories, categories) for c in to_union
+ ]
new_codes = np.concatenate(new_codes)
else:
# ordered - to show a proper error message
if all(c.ordered for c in to_union):
- msg = ("to union ordered Categoricals, "
- "all categories must be the same")
+ msg = "to union ordered Categoricals, " "all categories must be the same"
raise TypeError(msg)
else:
- raise TypeError('Categorical.ordered must be the same')
+ raise TypeError("Categorical.ordered must be the same")
if ignore_order:
ordered = False
- return Categorical(new_codes, categories=categories, ordered=ordered,
- fastpath=True)
+ return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def _concatenate_2d(to_concat, axis):
@@ -406,14 +425,14 @@ def _concat_datetime(to_concat, axis=0, typs=None):
# multiple types, need to coerce to object
if len(typs) != 1:
- return _concatenate_2d([_convert_datetimelike_to_object(x)
- for x in to_concat],
- axis=axis)
+ return _concatenate_2d(
+ [_convert_datetimelike_to_object(x) for x in to_concat], axis=axis
+ )
# must be single dtype
- if any(typ.startswith('datetime') for typ in typs):
+ if any(typ.startswith("datetime") for typ in typs):
- if 'datetime' in typs:
+ if "datetime" in typs:
to_concat = [x.astype(np.int64, copy=False) for x in to_concat]
return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE)
else:
@@ -421,11 +440,12 @@ def _concat_datetime(to_concat, axis=0, typs=None):
# thus no need to care
return _concat_datetimetz(to_concat)
- elif 'timedelta' in typs:
- return _concatenate_2d([x.view(np.int64) for x in to_concat],
- axis=axis).view(_TD_DTYPE)
+ elif "timedelta" in typs:
+ return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view(
+ _TD_DTYPE
+ )
- elif any(typ.startswith('period') for typ in typs):
+ elif any(typ.startswith("period") for typ in typs):
assert len(typs) == 1
cls = to_concat[0]
new_values = cls._concat_same_type(to_concat)
@@ -437,12 +457,11 @@ def _convert_datetimelike_to_object(x):
# if dtype is of datetimetz or timezone
if x.dtype.kind == _NS_DTYPE.kind:
- if getattr(x, 'tz', None) is not None:
+ if getattr(x, "tz", None) is not None:
x = np.asarray(x.astype(object))
else:
shape = x.shape
- x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(),
- box="timestamp")
+ x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp")
x = x.reshape(shape)
elif x.dtype == _TD_DTYPE:
@@ -483,17 +502,14 @@ def _concat_index_asobject(to_concat, name=None):
from pandas import Index
from pandas.core.arrays import ExtensionArray
- klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex,
- ExtensionArray)
- to_concat = [x.astype(object) if isinstance(x, klasses) else x
- for x in to_concat]
+ klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, ExtensionArray)
+ to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat]
self = to_concat[0]
attribs = self._get_attributes_dict()
- attribs['name'] = name
+ attribs["name"] = name
- to_concat = [x._values if isinstance(x, Index) else x
- for x in to_concat]
+ to_concat = [x._values if isinstance(x, Index) else x for x in to_concat]
return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs)
@@ -516,14 +532,16 @@ def _concat_sparse(to_concat, axis=0, typs=None):
from pandas.core.arrays import SparseArray
- fill_values = [x.fill_value for x in to_concat
- if isinstance(x, SparseArray)]
+ fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
fill_value = fill_values[0]
# TODO: Fix join unit generation so we aren't passed this.
- to_concat = [x if isinstance(x, SparseArray)
- else SparseArray(x.squeeze(), fill_value=fill_value)
- for x in to_concat]
+ to_concat = [
+ x
+ if isinstance(x, SparseArray)
+ else SparseArray(x.squeeze(), fill_value=fill_value)
+ for x in to_concat
+ ]
return SparseArray._concat_same_type(to_concat)
@@ -557,8 +575,9 @@ def _concat_rangeindex_same_dtype(indexes):
return _concat_index_same_dtype(indexes, klass=Int64Index)
step = rng.start - start
- non_consecutive = ((step != rng.step and len(rng) > 1) or
- (next_ is not None and rng.start != next_))
+ non_consecutive = (step != rng.step and len(rng) > 1) or (
+ next_ is not None and rng.start != next_
+ )
if non_consecutive:
return _concat_index_same_dtype(indexes, klass=Int64Index)
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index d8d910a16e32a..1cf452b4a6c2c 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -9,8 +9,7 @@
from pandas._libs.interval import Interval
from pandas._libs.tslibs import NaT, Period, Timestamp, timezones
-from pandas.core.dtypes.generic import (
- ABCCategoricalIndex, ABCDateOffset, ABCIndexClass)
+from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass
from .base import ExtensionDtype
from .inference import is_list_like
@@ -25,8 +24,7 @@
OrderedType = Union[None, bool, object]
-def register_extension_dtype(cls: Type[ExtensionDtype],
- ) -> Type[ExtensionDtype]:
+def register_extension_dtype(cls: Type[ExtensionDtype],) -> Type[ExtensionDtype]:
"""
Register an ExtensionType with pandas as class decorator.
@@ -67,6 +65,7 @@ class Registry:
Multiple extension types can be registered.
These are tried in order.
"""
+
def __init__(self):
self.dtypes = [] # type: List[Type[ExtensionDtype]]
@@ -81,9 +80,9 @@ def register(self, dtype: Type[ExtensionDtype]) -> None:
self.dtypes.append(dtype)
- def find(self,
- dtype: Union[Type[ExtensionDtype], str],
- ) -> Optional[Type[ExtensionDtype]]:
+ def find(
+ self, dtype: Union[Type[ExtensionDtype], str]
+ ) -> Optional[Type[ExtensionDtype]]:
"""
Parameters
----------
@@ -120,6 +119,7 @@ class PandasExtensionDtype(ExtensionDtype):
THIS IS NOT A REAL NUMPY DTYPE
"""
+
type = None # type: Any
kind = None # type: Any
# The Any type annotations above are here only because mypy seems to have a
@@ -149,8 +149,7 @@ def __repr__(self) -> str_type:
return str(self)
def __hash__(self) -> int:
- raise NotImplementedError("sub-classes should implement an __hash__ "
- "method")
+ raise NotImplementedError("sub-classes should implement an __hash__ " "method")
def __getstate__(self) -> Dict[str_type, Any]:
# pickle support; we don't want to pickle the cache
@@ -166,6 +165,7 @@ class CategoricalDtypeType(type):
"""
the type of CategoricalDtype, this metaclass determines subclass ability
"""
+
pass
@@ -212,35 +212,31 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
dtype: category
Categories (2, object): [b < a]
"""
+
# TODO: Document public vs. private API
- name = 'category'
+ name = "category"
type = CategoricalDtypeType # type: Type[CategoricalDtypeType]
- kind = 'O' # type: str_type
- str = '|O08'
- base = np.dtype('O')
- _metadata = ('categories', 'ordered')
+ kind = "O" # type: str_type
+ str = "|O08"
+ base = np.dtype("O")
+ _metadata = ("categories", "ordered")
_cache = {} # type: Dict[str_type, PandasExtensionDtype]
- def __init__(self,
- categories=None,
- ordered: OrderedType = ordered_sentinel):
+ def __init__(self, categories=None, ordered: OrderedType = ordered_sentinel):
self._finalize(categories, ordered, fastpath=False)
@classmethod
- def _from_fastpath(cls,
- categories=None,
- ordered: Optional[bool] = None
- ) -> 'CategoricalDtype':
+ def _from_fastpath(
+ cls, categories=None, ordered: Optional[bool] = None
+ ) -> "CategoricalDtype":
self = cls.__new__(cls)
self._finalize(categories, ordered, fastpath=True)
return self
@classmethod
- def _from_categorical_dtype(cls,
- dtype: 'CategoricalDtype',
- categories=None,
- ordered: OrderedType = None,
- ) -> 'CategoricalDtype':
+ def _from_categorical_dtype(
+ cls, dtype: "CategoricalDtype", categories=None, ordered: OrderedType = None
+ ) -> "CategoricalDtype":
if categories is ordered is None:
return dtype
if categories is None:
@@ -250,12 +246,13 @@ def _from_categorical_dtype(cls,
return cls(categories, ordered)
@classmethod
- def _from_values_or_dtype(cls,
- values=None,
- categories=None,
- ordered: Optional[bool] = None,
- dtype: Optional['CategoricalDtype'] = None,
- ) -> 'CategoricalDtype':
+ def _from_values_or_dtype(
+ cls,
+ values=None,
+ categories=None,
+ ordered: Optional[bool] = None,
+ dtype: Optional["CategoricalDtype"] = None,
+ ) -> "CategoricalDtype":
"""
Construct dtype from the input parameters used in :class:`Categorical`.
@@ -316,19 +313,21 @@ def _from_values_or_dtype(cls,
if dtype is not None:
# The dtype argument takes precedence over values.dtype (if any)
if isinstance(dtype, str):
- if dtype == 'category':
+ if dtype == "category":
dtype = CategoricalDtype(categories, ordered)
else:
msg = "Unknown dtype {dtype!r}"
raise ValueError(msg.format(dtype=dtype))
elif categories is not None or ordered is not None:
- raise ValueError("Cannot specify `categories` or `ordered` "
- "together with `dtype`.")
+ raise ValueError(
+ "Cannot specify `categories` or `ordered` " "together with `dtype`."
+ )
elif is_categorical(values):
# If no "dtype" was passed, use the one from "values", but honor
# the "ordered" and "categories" arguments
- dtype = values.dtype._from_categorical_dtype(values.dtype,
- categories, ordered)
+ dtype = values.dtype._from_categorical_dtype(
+ values.dtype, categories, ordered
+ )
else:
# If dtype=None and values is not categorical, create a new dtype.
# Note: This could potentially have categories=None and
@@ -337,18 +336,15 @@ def _from_values_or_dtype(cls,
return dtype
- def _finalize(self,
- categories,
- ordered: OrderedType,
- fastpath: bool = False,
- ) -> None:
+ def _finalize(
+ self, categories, ordered: OrderedType, fastpath: bool = False
+ ) -> None:
if ordered is not None and ordered is not ordered_sentinel:
self.validate_ordered(ordered)
if categories is not None:
- categories = self.validate_categories(categories,
- fastpath=fastpath)
+ categories = self.validate_categories(categories, fastpath=fastpath)
self._categories = categories
self._ordered = ordered if ordered is not ordered_sentinel else None
@@ -358,8 +354,8 @@ def __setstate__(self, state: Dict[str_type, Any]) -> None:
# for pickle compat. __get_state__ is defined in the
# PandasExtensionDtype superclass and uses the public properties to
# pickle -> need to set the settable private ones here (see GH26067)
- self._categories = state.pop('categories', None)
- self._ordered = state.pop('ordered', False)
+ self._categories = state.pop("categories", None)
+ self._ordered = state.pop("ordered", False)
def __hash__(self) -> int:
# _hash_categories returns a uint64, so use the negative
@@ -389,7 +385,7 @@ def __eq__(self, other: Any) -> bool:
return other == self.name
elif other is self:
return True
- elif not (hasattr(other, '_ordered') and hasattr(other, 'categories')):
+ elif not (hasattr(other, "_ordered") and hasattr(other, "categories")):
return False
elif self.categories is None or other.categories is None:
# We're forced into a suboptimal corner thanks to math and
@@ -401,8 +397,9 @@ def __eq__(self, other: Any) -> bool:
elif self._ordered or other._ordered:
# At least one has ordered=True; equal if both have ordered=True
# and the same values for categories in the same order.
- return ((self._ordered == other._ordered) and
- self.categories.equals(other.categories))
+ return (self._ordered == other._ordered) and self.categories.equals(
+ other.categories
+ )
else:
# Neither has ordered=True; equal if both have the same categories,
# but same order is not necessary. There is no distinction between
@@ -411,7 +408,7 @@ def __eq__(self, other: Any) -> bool:
return hash(self) == hash(other)
def __repr__(self):
- tpl = 'CategoricalDtype(categories={}ordered={})'
+ tpl = "CategoricalDtype(categories={}ordered={})"
if self.categories is None:
data = "None, "
else:
@@ -421,7 +418,9 @@ def __repr__(self):
@staticmethod
def _hash_categories(categories, ordered: OrderedType = True) -> int:
from pandas.core.util.hashing import (
- hash_array, _combine_hash_arrays, hash_tuples
+ hash_array,
+ _combine_hash_arrays,
+ hash_tuples,
)
from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE
@@ -432,7 +431,7 @@ def _hash_categories(categories, ordered: OrderedType = True) -> int:
categories = list(categories) # breaks if a np.array of categories
cat_array = hash_tuples(categories)
else:
- if categories.dtype == 'O':
+ if categories.dtype == "O":
if len({type(x) for x in categories}) != 1:
# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
@@ -447,13 +446,12 @@ def _hash_categories(categories, ordered: OrderedType = True) -> int:
cat_array = hash_array(np.asarray(categories), categorize=False)
if ordered:
- cat_array = np.vstack([
- cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)
- ])
+ cat_array = np.vstack(
+ [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
+ )
else:
cat_array = [cat_array]
- hashed = _combine_hash_arrays(iter(cat_array),
- num_items=len(cat_array))
+ hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
return np.bitwise_xor.reduce(hashed)
@classmethod
@@ -466,6 +464,7 @@ def construct_array_type(cls):
type
"""
from pandas import Categorical
+
return Categorical
@staticmethod
@@ -485,6 +484,7 @@ def validate_ordered(ordered: OrderedType) -> None:
If 'ordered' is not a boolean.
"""
from pandas.core.dtypes.common import is_bool
+
if not is_bool(ordered):
raise TypeError("'ordered' must either be 'True' or 'False'")
@@ -514,17 +514,17 @@ def validate_categories(categories, fastpath: bool = False):
if not fastpath:
if categories.hasnans:
- raise ValueError('Categorial categories cannot be null')
+ raise ValueError("Categorial categories cannot be null")
if not categories.is_unique:
- raise ValueError('Categorical categories must be unique')
+ raise ValueError("Categorical categories must be unique")
if isinstance(categories, ABCCategoricalIndex):
categories = categories.categories
return categories
- def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
+ def update_dtype(self, dtype: "CategoricalDtype") -> "CategoricalDtype":
"""
Returns a CategoricalDtype with categories and ordered taken from dtype
if specified, otherwise falling back to self if unspecified
@@ -537,12 +537,14 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
-------
new_dtype : CategoricalDtype
"""
- if isinstance(dtype, str) and dtype == 'category':
+ if isinstance(dtype, str) and dtype == "category":
# dtype='category' should not change anything
return self
elif not self.is_dtype(dtype):
- msg = ('a CategoricalDtype must be passed to perform an update, '
- 'got {dtype!r}').format(dtype=dtype)
+ msg = (
+ "a CategoricalDtype must be passed to perform an update, "
+ "got {dtype!r}"
+ ).format(dtype=dtype)
raise ValueError(msg)
# dtype is CDT: keep current categories/ordered if None
@@ -557,11 +559,13 @@ def update_dtype(self, dtype: 'CategoricalDtype') -> 'CategoricalDtype':
new_ordered = self._ordered
if self._ordered and new_ordered_from_sentinel:
# only warn if we'd actually change the existing behavior
- msg = ("Constructing a CategoricalDtype without specifying "
- "`ordered` will default to `ordered=False` in a future "
- "version, which will cause the resulting categorical's "
- "`ordered` attribute to change to False; `ordered=True`"
- " must be explicitly passed in order to be retained")
+ msg = (
+ "Constructing a CategoricalDtype without specifying "
+ "`ordered` will default to `ordered=False` in a future "
+ "version, which will cause the resulting categorical's "
+ "`ordered` attribute to change to False; `ordered=True`"
+ " must be explicitly passed in order to be retained"
+ )
warnings.warn(msg, FutureWarning, stacklevel=3)
return CategoricalDtype(new_categories, new_ordered)
@@ -582,9 +586,11 @@ def ordered(self) -> OrderedType:
if self._ordered_from_sentinel and self._ordered is None:
# warn when accessing ordered if ordered=None and None was not
# explicitly passed to the constructor
- msg = ("Constructing a CategoricalDtype without specifying "
- "`ordered` will default to `ordered=False` in a future "
- "version; `ordered=None` must be explicitly passed.")
+ msg = (
+ "Constructing a CategoricalDtype without specifying "
+ "`ordered` will default to `ordered=False` in a future "
+ "version; `ordered=None` must be explicitly passed."
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
return self._ordered
@@ -632,13 +638,14 @@ class DatetimeTZDtype(PandasExtensionDtype):
>>> pd.DatetimeTZDtype(tz='dateutil/US/Central')
datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
"""
+
type = Timestamp # type: Type[Timestamp]
- kind = 'M' # type: str_type
- str = '|M8[ns]'
+ kind = "M" # type: str_type
+ str = "|M8[ns]"
num = 101
- base = np.dtype('M8[ns]')
+ base = np.dtype("M8[ns]")
na_value = NaT
- _metadata = ('unit', 'tz')
+ _metadata = ("unit", "tz")
_match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]")
_cache = {} # type: Dict[str_type, PandasExtensionDtype]
@@ -646,7 +653,7 @@ def __init__(self, unit="ns", tz=None):
if isinstance(unit, DatetimeTZDtype):
unit, tz = unit.unit, unit.tz
- if unit != 'ns':
+ if unit != "ns":
if isinstance(unit, str) and tz is None:
# maybe a string like datetime64[ns, tz], which we support for
# now.
@@ -697,6 +704,7 @@ def construct_array_type(cls):
type
"""
from pandas.core.arrays import DatetimeArray
+
return DatetimeArray
@classmethod
@@ -722,7 +730,7 @@ def construct_from_string(cls, string):
match = cls._match.match(string)
if match:
d = match.groupdict()
- return cls(unit=d['unit'], tz=d['tz'])
+ return cls(unit=d["unit"], tz=d["tz"])
except Exception:
# TODO(py3): Change this pass to `raise TypeError(msg) from e`
pass
@@ -747,16 +755,18 @@ def __eq__(self, other):
if isinstance(other, str):
return other == self.name
- return (isinstance(other, DatetimeTZDtype) and
- self.unit == other.unit and
- str(self.tz) == str(other.tz))
+ return (
+ isinstance(other, DatetimeTZDtype)
+ and self.unit == other.unit
+ and str(self.tz) == str(other.tz)
+ )
def __setstate__(self, state):
# for pickle compat. __get_state__ is defined in the
# PandasExtensionDtype superclass and uses the public properties to
# pickle -> need to set the settable private ones here (see GH26067)
- self._tz = state['tz']
- self._unit = state['unit']
+ self._tz = state["tz"]
+ self._unit = state["unit"]
@register_extension_dtype
@@ -787,12 +797,13 @@ class PeriodDtype(PandasExtensionDtype):
>>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
period[M]
"""
+
type = Period # type: Type[Period]
- kind = 'O' # type: str_type
- str = '|O08'
- base = np.dtype('O')
+ kind = "O" # type: str_type
+ str = "|O08"
+ base = np.dtype("O")
num = 102
- _metadata = ('freq',)
+ _metadata = ("freq",)
_match = re.compile(r"(P|p)eriod\[(?P.+)\]")
_cache = {} # type: Dict[str_type, PandasExtensionDtype]
@@ -833,11 +844,12 @@ def freq(self):
@classmethod
def _parse_dtype_strict(cls, freq):
if isinstance(freq, str):
- if freq.startswith('period[') or freq.startswith('Period['):
+ if freq.startswith("period[") or freq.startswith("Period["):
m = cls._match.search(freq)
if m is not None:
- freq = m.group('freq')
+ freq = m.group("freq")
from pandas.tseries.frequencies import to_offset
+
freq = to_offset(freq)
if freq is not None:
return freq
@@ -850,10 +862,11 @@ def construct_from_string(cls, string):
Strict construction from a string, raise a TypeError if not
possible
"""
- if (isinstance(string, str) and
- (string.startswith('period[') or
- string.startswith('Period[')) or
- isinstance(string, ABCDateOffset)):
+ if (
+ isinstance(string, str)
+ and (string.startswith("period[") or string.startswith("Period["))
+ or isinstance(string, ABCDateOffset)
+ ):
# do not parse string like U as period[U]
# avoid tuple to be regarded as freq
try:
@@ -887,7 +900,7 @@ def __setstate__(self, state):
# for pickle compat. __get_state__ is defined in the
# PandasExtensionDtype superclass and uses the public properties to
# pickle -> need to set the settable private ones here (see GH26067)
- self._freq = state['freq']
+ self._freq = state["freq"]
@classmethod
def is_dtype(cls, dtype):
@@ -899,7 +912,7 @@ def is_dtype(cls, dtype):
if isinstance(dtype, str):
# PeriodDtype can be instantiated from freq string like "U",
# but doesn't regard freq str like "U" as dtype.
- if dtype.startswith('period[') or dtype.startswith('Period['):
+ if dtype.startswith("period[") or dtype.startswith("Period["):
try:
if cls._parse_dtype_strict(dtype) is not None:
return True
@@ -943,18 +956,22 @@ class IntervalDtype(PandasExtensionDtype):
>>> pd.IntervalDtype(subtype='int64')
interval[int64]
"""
- name = 'interval'
+
+ name = "interval"
kind = None # type: Optional[str_type]
- str = '|O08'
- base = np.dtype('O')
+ str = "|O08"
+ base = np.dtype("O")
num = 103
- _metadata = ('subtype',)
+ _metadata = ("subtype",)
_match = re.compile(r"(I|i)nterval\[(?P.+)\]")
_cache = {} # type: Dict[str_type, PandasExtensionDtype]
def __new__(cls, subtype=None):
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_string_dtype, pandas_dtype)
+ is_categorical_dtype,
+ is_string_dtype,
+ pandas_dtype,
+ )
if isinstance(subtype, IntervalDtype):
return subtype
@@ -964,14 +981,13 @@ def __new__(cls, subtype=None):
u = object.__new__(cls)
u._subtype = None
return u
- elif (isinstance(subtype, str) and
- subtype.lower() == 'interval'):
+ elif isinstance(subtype, str) and subtype.lower() == "interval":
subtype = None
else:
if isinstance(subtype, str):
m = cls._match.search(subtype)
if m is not None:
- subtype = m.group('subtype')
+ subtype = m.group("subtype")
try:
subtype = pandas_dtype(subtype)
@@ -980,8 +996,10 @@ def __new__(cls, subtype=None):
if is_categorical_dtype(subtype) or is_string_dtype(subtype):
# GH 19016
- msg = ('category, object, and string subtypes are not supported '
- 'for IntervalDtype')
+ msg = (
+ "category, object, and string subtypes are not supported "
+ "for IntervalDtype"
+ )
raise TypeError(msg)
try:
@@ -1009,6 +1027,7 @@ def construct_array_type(cls):
type
"""
from pandas.core.arrays import IntervalArray
+
return IntervalArray
@classmethod
@@ -1021,13 +1040,14 @@ def construct_from_string(cls, string):
msg = "a string needs to be passed, got type {typ}"
raise TypeError(msg.format(typ=type(string)))
- if (string.lower() == 'interval' or
- cls._match.search(string) is not None):
+ if string.lower() == "interval" or cls._match.search(string) is not None:
return cls(string)
- msg = ('Incorrectly formatted string passed to constructor. '
- 'Valid formats include Interval or Interval[dtype] '
- 'where dtype is numeric, datetime, or timedelta')
+ msg = (
+ "Incorrectly formatted string passed to constructor. "
+ "Valid formats include Interval or Interval[dtype] "
+ "where dtype is numeric, datetime, or timedelta"
+ )
raise TypeError(msg)
@property
@@ -1053,13 +1073,14 @@ def __eq__(self, other):
return True
else:
from pandas.core.dtypes.common import is_dtype_equal
+
return is_dtype_equal(self.subtype, other.subtype)
def __setstate__(self, state):
# for pickle compat. __get_state__ is defined in the
# PandasExtensionDtype superclass and uses the public properties to
# pickle -> need to set the settable private ones here (see GH26067)
- self._subtype = state['subtype']
+ self._subtype = state["subtype"]
@classmethod
def is_dtype(cls, dtype):
@@ -1069,7 +1090,7 @@ def is_dtype(cls, dtype):
"""
if isinstance(dtype, str):
- if dtype.lower().startswith('interval'):
+ if dtype.lower().startswith("interval"):
try:
if cls.construct_from_string(dtype) is not None:
return True
diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py
index 86aff93dfde14..de41644f09b66 100644
--- a/pandas/core/dtypes/generic.py
+++ b/pandas/core/dtypes/generic.py
@@ -6,76 +6,79 @@
def create_pandas_abc_type(name, attr, comp):
@classmethod
def _check(cls, inst):
- return getattr(inst, attr, '_typ') in comp
+ return getattr(inst, attr, "_typ") in comp
dct = dict(__instancecheck__=_check, __subclasscheck__=_check)
- meta = type("ABCBase", (type, ), dct)
+ meta = type("ABCBase", (type,), dct)
return meta(name, tuple(), dct)
-ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", ))
-ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ",
- ("int64index", ))
-ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ",
- ("uint64index", ))
-ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ",
- ("rangeindex", ))
-ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ",
- ("float64index", ))
-ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ",
- ("multiindex", ))
-ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ",
- ("datetimeindex", ))
-ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ",
- ("timedeltaindex", ))
-ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ",
- ("periodindex", ))
-ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ",
- ("categoricalindex", ))
-ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ",
- ("intervalindex", ))
-ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ",
- ("index", "int64index", "rangeindex",
- "float64index", "uint64index",
- "multiindex", "datetimeindex",
- "timedeltaindex", "periodindex",
- "categoricalindex", "intervalindex"))
+ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",))
+ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",))
+ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",))
+ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",))
+ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",))
+ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",))
+ABCDatetimeIndex = create_pandas_abc_type(
+ "ABCDatetimeIndex", "_typ", ("datetimeindex",)
+)
+ABCTimedeltaIndex = create_pandas_abc_type(
+ "ABCTimedeltaIndex", "_typ", ("timedeltaindex",)
+)
+ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",))
+ABCCategoricalIndex = create_pandas_abc_type(
+ "ABCCategoricalIndex", "_typ", ("categoricalindex",)
+)
+ABCIntervalIndex = create_pandas_abc_type(
+ "ABCIntervalIndex", "_typ", ("intervalindex",)
+)
+ABCIndexClass = create_pandas_abc_type(
+ "ABCIndexClass",
+ "_typ",
+ (
+ "index",
+ "int64index",
+ "rangeindex",
+ "float64index",
+ "uint64index",
+ "multiindex",
+ "datetimeindex",
+ "timedeltaindex",
+ "periodindex",
+ "categoricalindex",
+ "intervalindex",
+ ),
+)
-ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", ))
-ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", ))
-ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp",
- ("sparse_frame", ))
-ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
- ('sparse_series',
- 'sparse_time_series'))
-ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp",
- ('sparse_array', 'sparse_series'))
-ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ",
- ("categorical"))
-ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ",
- ("datetimearray"))
-ABCTimedeltaArray = create_pandas_abc_type("ABCTimedeltaArray", "_typ",
- ("timedeltaarray"))
-ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ",
- ("periodarray", ))
-ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", ))
-ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ",
- ("dateoffset",))
-ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval", ))
-ABCExtensionArray = create_pandas_abc_type("ABCExtensionArray", "_typ",
- ("extension",
- "categorical",
- "periodarray",
- "datetimearray",
- "timedeltaarray",
- ))
-ABCPandasArray = create_pandas_abc_type("ABCPandasArray",
- "_typ",
- ("npy_extension",))
+ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",))
+ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
+ABCSparseDataFrame = create_pandas_abc_type(
+ "ABCSparseDataFrame", "_subtyp", ("sparse_frame",)
+)
+ABCSparseSeries = create_pandas_abc_type(
+ "ABCSparseSeries", "_subtyp", ("sparse_series", "sparse_time_series")
+)
+ABCSparseArray = create_pandas_abc_type(
+ "ABCSparseArray", "_subtyp", ("sparse_array", "sparse_series")
+)
+ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical"))
+ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray"))
+ABCTimedeltaArray = create_pandas_abc_type(
+ "ABCTimedeltaArray", "_typ", ("timedeltaarray")
+)
+ABCPeriodArray = create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",))
+ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",))
+ABCDateOffset = create_pandas_abc_type("ABCDateOffset", "_typ", ("dateoffset",))
+ABCInterval = create_pandas_abc_type("ABCInterval", "_typ", ("interval",))
+ABCExtensionArray = create_pandas_abc_type(
+ "ABCExtensionArray",
+ "_typ",
+ ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"),
+)
+ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",))
class _ABCGeneric(type):
-
def __instancecheck__(cls, inst):
return hasattr(inst, "_data")
diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py
index 02ee777bbe7f3..9373ea18e8a24 100644
--- a/pandas/core/dtypes/inference.py
+++ b/pandas/core/dtypes/inference.py
@@ -143,10 +143,10 @@ def is_iterator(obj):
False
"""
- if not hasattr(obj, '__iter__'):
+ if not hasattr(obj, "__iter__"):
return False
- return hasattr(obj, '__next__')
+ return hasattr(obj, "__next__")
def is_file_like(obj):
@@ -180,7 +180,7 @@ def is_file_like(obj):
False
"""
- if not (hasattr(obj, 'read') or hasattr(obj, 'write')):
+ if not (hasattr(obj, "read") or hasattr(obj, "write")):
return False
if not hasattr(obj, "__iter__"):
@@ -281,15 +281,18 @@ def is_list_like(obj, allow_sets=True):
False
"""
- return (isinstance(obj, abc.Iterable) and
- # we do not count strings/unicode/bytes as list-like
- not isinstance(obj, (str, bytes)) and
-
- # exclude zero-dimensional numpy arrays, effectively scalars
- not (isinstance(obj, np.ndarray) and obj.ndim == 0) and
-
- # exclude sets if allow_sets is False
- not (allow_sets is False and isinstance(obj, abc.Set)))
+ return (
+ isinstance(obj, abc.Iterable)
+ and
+ # we do not count strings/unicode/bytes as list-like
+ not isinstance(obj, (str, bytes))
+ and
+ # exclude zero-dimensional numpy arrays, effectively scalars
+ not (isinstance(obj, np.ndarray) and obj.ndim == 0)
+ and
+ # exclude sets if allow_sets is False
+ not (allow_sets is False and isinstance(obj, abc.Set))
+ )
def is_array_like(obj):
@@ -365,8 +368,12 @@ def is_nested_list_like(obj):
--------
is_list_like
"""
- return (is_list_like(obj) and hasattr(obj, '__len__') and
- len(obj) > 0 and all(is_list_like(item) for item in obj))
+ return (
+ is_list_like(obj)
+ and hasattr(obj, "__len__")
+ and len(obj) > 0
+ and all(is_list_like(item) for item in obj)
+ )
def is_dict_like(obj):
@@ -394,9 +401,11 @@ def is_dict_like(obj):
True
"""
dict_like_attrs = ("__getitem__", "keys", "__contains__")
- return (all(hasattr(obj, attr) for attr in dict_like_attrs)
- # [GH 25196] exclude classes
- and not isinstance(obj, type))
+ return (
+ all(hasattr(obj, attr) for attr in dict_like_attrs)
+ # [GH 25196] exclude classes
+ and not isinstance(obj, type)
+ )
def is_named_tuple(obj):
@@ -423,7 +432,7 @@ def is_named_tuple(obj):
False
"""
- return isinstance(obj, tuple) and hasattr(obj, '_fields')
+ return isinstance(obj, tuple) and hasattr(obj, "_fields")
def is_hashable(obj):
@@ -489,7 +498,7 @@ def is_sequence(obj):
try:
iter(obj) # Can iterate over it.
- len(obj) # Has a length associated with it.
+ len(obj) # Has a length associated with it.
return not isinstance(obj, (str, bytes))
except (TypeError, AttributeError):
return False
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 914a292d3db97..f540e9297738a 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -8,15 +8,37 @@
from pandas._libs.tslibs import NaT, iNaT
from .common import (
- _NS_DTYPE, _TD_DTYPE, ensure_object, is_bool_dtype, is_complex_dtype,
- is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike,
- is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype,
- is_float_dtype, is_integer_dtype, is_object_dtype, is_period_dtype,
- is_scalar, is_string_dtype, is_string_like_dtype, is_timedelta64_dtype,
- needs_i8_conversion, pandas_dtype)
+ _NS_DTYPE,
+ _TD_DTYPE,
+ ensure_object,
+ is_bool_dtype,
+ is_complex_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetimelike,
+ is_datetimelike_v_numeric,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float_dtype,
+ is_integer_dtype,
+ is_object_dtype,
+ is_period_dtype,
+ is_scalar,
+ is_string_dtype,
+ is_string_like_dtype,
+ is_timedelta64_dtype,
+ needs_i8_conversion,
+ pandas_dtype,
+)
from .generic import (
- ABCDatetimeArray, ABCExtensionArray, ABCGeneric, ABCIndexClass,
- ABCMultiIndex, ABCSeries, ABCTimedeltaArray)
+ ABCDatetimeArray,
+ ABCExtensionArray,
+ ABCGeneric,
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCSeries,
+ ABCTimedeltaArray,
+)
from .inference import is_list_like
isposinf_scalar = libmissing.isposinf_scalar
@@ -109,15 +131,23 @@ def _isna_new(obj):
# hack (for now) because MI registers as ndarray
elif isinstance(obj, ABCMultiIndex):
raise NotImplementedError("isna is not defined for MultiIndex")
- elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
- ABCExtensionArray,
- ABCDatetimeArray, ABCTimedeltaArray)):
+ elif isinstance(
+ obj,
+ (
+ ABCSeries,
+ np.ndarray,
+ ABCIndexClass,
+ ABCExtensionArray,
+ ABCDatetimeArray,
+ ABCTimedeltaArray,
+ ),
+ ):
return _isna_ndarraylike(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isna(func=isna))
elif isinstance(obj, list):
return _isna_ndarraylike(np.asarray(obj, dtype=object))
- elif hasattr(obj, '__array__'):
+ elif hasattr(obj, "__array__"):
return _isna_ndarraylike(np.asarray(obj))
else:
return obj is None
@@ -145,7 +175,7 @@ def _isna_old(obj):
return obj._constructor(obj._data.isna(func=_isna_old))
elif isinstance(obj, list):
return _isna_ndarraylike_old(np.asarray(obj, dtype=object))
- elif hasattr(obj, '__array__'):
+ elif hasattr(obj, "__array__"):
return _isna_ndarraylike_old(np.asarray(obj))
else:
return obj is None
@@ -174,11 +204,12 @@ def _use_inf_as_na(key):
programmatically-creating-variables-in-python/4859312#4859312
"""
from pandas._config import get_option
+
flag = get_option(key)
if flag:
- globals()['_isna'] = _isna_old
+ globals()["_isna"] = _isna_old
else:
- globals()['_isna'] = _isna_new
+ globals()["_isna"] = _isna_new
def _isna_ndarraylike(obj):
@@ -187,7 +218,7 @@ def _isna_ndarraylike(obj):
if not is_extension:
# Avoid accessing `.values` on things like
# PeriodIndex, which may be expensive.
- values = getattr(obj, 'values', obj)
+ values = getattr(obj, "values", obj)
else:
values = obj
@@ -216,20 +247,19 @@ def _isna_ndarraylike(obj):
elif needs_i8_conversion(dtype):
# this is the NaT pattern
- result = values.view('i8') == iNaT
+ result = values.view("i8") == iNaT
else:
result = np.isnan(values)
# box
if isinstance(obj, ABCSeries):
- result = obj._constructor(
- result, index=obj.index, name=obj.name, copy=False)
+ result = obj._constructor(result, index=obj.index, name=obj.name, copy=False)
return result
def _isna_ndarraylike_old(obj):
- values = getattr(obj, 'values', obj)
+ values = getattr(obj, "values", obj)
dtype = values.dtype
if is_string_dtype(dtype):
@@ -245,14 +275,13 @@ def _isna_ndarraylike_old(obj):
elif is_datetime64_dtype(dtype):
# this is the NaT pattern
- result = values.view('i8') == iNaT
+ result = values.view("i8") == iNaT
else:
result = ~np.isfinite(values)
# box
if isinstance(obj, ABCSeries):
- result = obj._constructor(
- result, index=obj.index, name=obj.name, copy=False)
+ result = obj._constructor(result, index=obj.index, name=obj.name, copy=False)
return result
@@ -353,8 +382,7 @@ def _isna_compat(arr, fill_value=np.nan):
"""
dtype = arr.dtype
if isna(fill_value):
- return not (is_bool_dtype(dtype) or
- is_integer_dtype(dtype))
+ return not (is_bool_dtype(dtype) or is_integer_dtype(dtype))
return True
@@ -402,15 +430,15 @@ def array_equivalent(left, right, strict_nan=False):
if not strict_nan:
# isna considers NaN and None to be equivalent.
return lib.array_equivalent_object(
- ensure_object(left.ravel()), ensure_object(right.ravel()))
+ ensure_object(left.ravel()), ensure_object(right.ravel())
+ )
for left_value, right_value in zip(left, right):
if left_value is NaT and right_value is not NaT:
return False
elif isinstance(left_value, float) and np.isnan(left_value):
- if (not isinstance(right_value, float) or
- not np.isnan(right_value)):
+ if not isinstance(right_value, float) or not np.isnan(right_value):
return False
else:
if left_value != right_value:
@@ -434,12 +462,11 @@ def array_equivalent(left, right, strict_nan=False):
if not is_dtype_equal(left.dtype, right.dtype):
return False
- left = left.view('i8')
- right = right.view('i8')
+ left = left.view("i8")
+ right = right.view("i8")
# if we have structured dtypes, compare first
- if (left.dtype.type is np.void or
- right.dtype.type is np.void):
+ if left.dtype.type is np.void or right.dtype.type is np.void:
if left.dtype != right.dtype:
return False
@@ -457,13 +484,13 @@ def _infer_fill_value(val):
val = [val]
val = np.array(val, copy=False)
if is_datetimelike(val):
- return np.array('NaT', dtype=val.dtype)
+ return np.array("NaT", dtype=val.dtype)
elif is_object_dtype(val.dtype):
dtype = lib.infer_dtype(ensure_object(val), skipna=False)
- if dtype in ['datetime', 'datetime64']:
- return np.array('NaT', dtype=_NS_DTYPE)
- elif dtype in ['timedelta', 'timedelta64']:
- return np.array('NaT', dtype=_TD_DTYPE)
+ if dtype in ["datetime", "datetime64"]:
+ return np.array("NaT", dtype=_NS_DTYPE)
+ elif dtype in ["timedelta", "timedelta64"]:
+ return np.array("NaT", dtype=_TD_DTYPE)
return np.nan
@@ -506,8 +533,12 @@ def na_value_for_dtype(dtype, compat=True):
if is_extension_array_dtype(dtype):
return dtype.na_value
- if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
- is_timedelta64_dtype(dtype) or is_period_dtype(dtype)):
+ if (
+ is_datetime64_dtype(dtype)
+ or is_datetime64tz_dtype(dtype)
+ or is_timedelta64_dtype(dtype)
+ or is_period_dtype(dtype)
+ ):
return NaT
elif is_float_dtype(dtype):
return np.nan
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 0dba7c7b5d288..a1989fd62b6ee 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -27,44 +27,89 @@
from pandas.compat import PY36, raise_with_traceback
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
- Appender, Substitution, deprecate_kwarg, rewrite_axis_style_signature)
-from pandas.util._validators import (
- validate_axis_style_args, validate_bool_kwarg)
+ Appender,
+ Substitution,
+ deprecate_kwarg,
+ rewrite_axis_style_signature,
+)
+from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg
from pandas.core.dtypes.cast import (
- cast_scalar_to_array, coerce_to_dtypes, find_common_type,
- infer_dtype_from_scalar, invalidate_string_dtypes, maybe_cast_to_datetime,
- maybe_convert_platform, maybe_downcast_to_dtype,
- maybe_infer_to_datetimelike, maybe_upcast, maybe_upcast_putmask)
+ cast_scalar_to_array,
+ coerce_to_dtypes,
+ find_common_type,
+ infer_dtype_from_scalar,
+ invalidate_string_dtypes,
+ maybe_cast_to_datetime,
+ maybe_convert_platform,
+ maybe_downcast_to_dtype,
+ maybe_infer_to_datetimelike,
+ maybe_upcast,
+ maybe_upcast_putmask,
+)
from pandas.core.dtypes.common import (
- ensure_float64, ensure_int64, ensure_platform_int, infer_dtype_from_object,
- is_bool_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype,
- is_dict_like, is_dtype_equal, is_extension_array_dtype, is_extension_type,
- is_float_dtype, is_integer, is_integer_dtype, is_iterator, is_list_like,
- is_named_tuple, is_nested_list_like, is_object_dtype, is_scalar,
- is_sequence, needs_i8_conversion)
+ ensure_float64,
+ ensure_int64,
+ ensure_platform_int,
+ infer_dtype_from_object,
+ is_bool_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64tz_dtype,
+ is_dict_like,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_iterator,
+ is_list_like,
+ is_named_tuple,
+ is_nested_list_like,
+ is_object_dtype,
+ is_scalar,
+ is_sequence,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries)
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import isna, notna
from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
-from pandas.core.arrays.datetimelike import (
- DatetimeLikeArrayMixin as DatetimeLikeArray)
+from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (
- Index, MultiIndex, ensure_index, ensure_index_from_sequences)
+ Index,
+ MultiIndex,
+ ensure_index,
+ ensure_index_from_sequences,
+)
from pandas.core.indexes import base as ibase
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import (
- check_bool_indexer, convert_to_index_sliceable, maybe_droplevels)
+ check_bool_indexer,
+ convert_to_index_sliceable,
+ maybe_droplevels,
+)
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
- arrays_to_mgr, get_names_from_index, init_dict, init_ndarray,
- masked_rec_array_to_mgr, reorder_arrays, sanitize_index, to_arrays)
+ arrays_to_mgr,
+ get_names_from_index,
+ init_dict,
+ init_ndarray,
+ masked_rec_array_to_mgr,
+ reorder_arrays,
+ sanitize_index,
+ to_arrays,
+)
from pandas.core.series import Series
from pandas.io.formats import console, format as fmt
@@ -75,7 +120,8 @@
# Docstring templates
_shared_doc_kwargs = dict(
- axes='index, columns', klass='DataFrame',
+ axes="index, columns",
+ klass="DataFrame",
axes_single_arg="{0 or 'index', 1 or 'columns'}",
axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
If 0 or 'index': apply function to each column.
@@ -91,7 +137,7 @@
.. versionchanged:: 0.23.0
Allow specifying index or column level names.""",
- versionadded_to_excel='',
+ versionadded_to_excel="",
optional_labels="""labels : array-like, optional
New labels / index to conform the axis specified by 'axis' to.""",
optional_axis="""axis : int or str, optional
@@ -330,9 +376,9 @@ def _constructor(self):
return DataFrame
_constructor_sliced = Series # type: Type[Series]
- _deprecations = NDFrame._deprecations | frozenset([
- 'get_value', 'set_value', 'from_items'
- ]) # type: FrozenSet[str]
+ _deprecations = NDFrame._deprecations | frozenset(
+ ["get_value", "set_value", "from_items"]
+ ) # type: FrozenSet[str]
_accessors = set() # type: Set[str]
@property
@@ -342,8 +388,7 @@ def _constructor_expanddim(self):
# ----------------------------------------------------------------------
# Constructors
- def __init__(self, data=None, index=None, columns=None, dtype=None,
- copy=False):
+ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False):
if data is None:
data = {}
if dtype is not None:
@@ -353,16 +398,17 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
data = data._data
if isinstance(data, BlockManager):
- mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
- dtype=dtype, copy=copy)
+ mgr = self._init_mgr(
+ data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+ )
elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords
+
# masked recarray
if isinstance(data, mrecords.MaskedRecords):
- mgr = masked_rec_array_to_mgr(data, index, columns, dtype,
- copy)
+ mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
# a masked array
else:
@@ -373,8 +419,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
data[mask] = fill_value
else:
data = data.copy()
- mgr = init_ndarray(data, index, columns, dtype=dtype,
- copy=copy)
+ mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
@@ -383,20 +428,17 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
if columns is None:
columns = data_columns
mgr = init_dict(data, index, columns, dtype=dtype)
- elif getattr(data, 'name', None) is not None:
- mgr = init_dict({data.name: data}, index, columns,
- dtype=dtype)
+ elif getattr(data, "name", None) is not None:
+ mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
else:
- mgr = init_ndarray(data, index, columns, dtype=dtype,
- copy=copy)
+ mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
# For data is list-like, or Iterable (will consume into list)
- elif (isinstance(data, abc.Iterable) and
- not isinstance(data, (str, bytes))):
+ elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
if not isinstance(data, abc.Sequence):
data = list(data)
if len(data) > 0:
- if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
+ if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
@@ -411,28 +453,30 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
else:
index = ibase.default_index(len(data))
- mgr = arrays_to_mgr(arrays, columns, index, columns,
- dtype=dtype)
+ mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
else:
- mgr = init_ndarray(data, index, columns, dtype=dtype,
- copy=copy)
+ mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as e:
- exc = TypeError('DataFrame constructor called with '
- 'incompatible data and dtype: {e}'.format(e=e))
+ exc = TypeError(
+ "DataFrame constructor called with "
+ "incompatible data and dtype: {e}".format(e=e)
+ )
raise_with_traceback(exc)
if arr.ndim == 0 and index is not None and columns is not None:
- values = cast_scalar_to_array((len(index), len(columns)),
- data, dtype=dtype)
- mgr = init_ndarray(values, index, columns,
- dtype=values.dtype, copy=False)
+ values = cast_scalar_to_array(
+ (len(index), len(columns)), data, dtype=dtype
+ )
+ mgr = init_ndarray(
+ values, index, columns, dtype=values.dtype, copy=False
+ )
else:
- raise ValueError('DataFrame constructor not properly called!')
+ raise ValueError("DataFrame constructor not properly called!")
NDFrame.__init__(self, mgr, fastpath=True)
@@ -533,8 +577,9 @@ def _repr_fits_horizontal_(self, ignore_width=False):
nb_columns = len(self.columns)
# exceed max columns
- if ((max_columns and nb_columns > max_columns) or
- ((not ignore_width) and width and nb_columns > (width // 2))):
+ if (max_columns and nb_columns > max_columns) or (
+ (not ignore_width) and width and nb_columns > (width // 2)
+ ):
return False
# used by repr_html under IPython notebook or scripts ignore terminal
@@ -542,8 +587,7 @@ def _repr_fits_horizontal_(self, ignore_width=False):
if ignore_width or not console.in_interactive_session():
return True
- if (get_option('display.width') is not None or
- console.in_ipython_frontend()):
+ if get_option("display.width") is not None or console.in_ipython_frontend():
# check at least the column row for excessive width
max_rows = 1
else:
@@ -560,13 +604,13 @@ def _repr_fits_horizontal_(self, ignore_width=False):
if not (max_rows is None): # unlimited rows
# min of two, where one may be None
- d = d.iloc[:min(max_rows, len(d))]
+ d = d.iloc[: min(max_rows, len(d))]
else:
return True
d.to_string(buf=buf)
value = buf.getvalue()
- repr_width = max(len(l) for l in value.split('\n'))
+ repr_width = max(len(l) for l in value.split("\n"))
return repr_width < width
@@ -574,9 +618,10 @@ def _info_repr(self):
"""
True if the repr should show the info view.
"""
- info_repr_option = (get_option("display.large_repr") == "info")
- return info_repr_option and not (self._repr_fits_horizontal_() and
- self._repr_fits_vertical_())
+ info_repr_option = get_option("display.large_repr") == "info"
+ return info_repr_option and not (
+ self._repr_fits_horizontal_() and self._repr_fits_vertical_()
+ )
def __repr__(self):
"""
@@ -595,9 +640,14 @@ def __repr__(self):
width, _ = console.get_console_size()
else:
width = None
- self.to_string(buf=buf, max_rows=max_rows, min_rows=min_rows,
- max_cols=max_cols, line_width=width,
- show_dimensions=show_dimensions)
+ self.to_string(
+ buf=buf,
+ max_rows=max_rows,
+ min_rows=min_rows,
+ max_cols=max_cols,
+ line_width=width,
+ show_dimensions=show_dimensions,
+ )
return buf.getvalue()
@@ -611,32 +661,52 @@ def _repr_html_(self):
buf = StringIO("")
self.info(buf=buf)
# need to escape the <class>, should be the first line.
- val = buf.getvalue().replace('<', r'&lt;', 1)
- val = val.replace('>', r'&gt;', 1)
- return '<pre>' + val + '</pre>'
+ val = buf.getvalue().replace("<", r"&lt;", 1)
+ val = val.replace(">", r"&gt;", 1)
+ return "<pre>" + val + "</pre>"
if get_option("display.notebook_repr_html"):
max_rows = get_option("display.max_rows")
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")
- return self.to_html(max_rows=max_rows, max_cols=max_cols,
- show_dimensions=show_dimensions, notebook=True)
+ return self.to_html(
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ notebook=True,
+ )
else:
return None
- @Substitution(header='Write out the column names. If a list of strings '
- 'is given, it is assumed to be aliases for the '
- 'column names',
- col_space_type='int',
- col_space='The minimum width of each column')
- @Substitution(shared_params=fmt.common_docstring,
- returns=fmt.return_docstring)
- def to_string(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='NaN', formatters=None, float_format=None,
- sparsify=None, index_names=True, justify=None,
- max_rows=None, min_rows=None, max_cols=None,
- show_dimensions=False, decimal='.', line_width=None):
+ @Substitution(
+ header="Write out the column names. If a list of strings "
+ "is given, it is assumed to be aliases for the "
+ "column names",
+ col_space_type="int",
+ col_space="The minimum width of each column",
+ )
+ @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+ def to_string(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ justify=None,
+ max_rows=None,
+ min_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ decimal=".",
+ line_width=None,
+ ):
"""
Render a DataFrame to a console-friendly tabular output.
%(shared_params)s
@@ -658,19 +728,26 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True,
2 3 6
"""
- formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
- col_space=col_space, na_rep=na_rep,
- formatters=formatters,
- float_format=float_format,
- sparsify=sparsify, justify=justify,
- index_names=index_names,
- header=header, index=index,
- min_rows=min_rows,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=decimal,
- line_width=line_width)
+ formatter = fmt.DataFrameFormatter(
+ self,
+ buf=buf,
+ columns=columns,
+ col_space=col_space,
+ na_rep=na_rep,
+ formatters=formatters,
+ float_format=float_format,
+ sparsify=sparsify,
+ justify=justify,
+ index_names=index_names,
+ header=header,
+ index=index,
+ min_rows=min_rows,
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ decimal=decimal,
+ line_width=line_width,
+ )
formatter.to_string()
if buf is None:
@@ -690,6 +767,7 @@ def style(self):
io.formats.style.Styler
"""
from pandas.io.formats.style import Styler
+
return Styler(self)
def iteritems(self):
@@ -740,7 +818,7 @@ def iteritems(self):
koala 80000
Name: population, dtype: int64
"""
- if self.columns.is_unique and hasattr(self, '_item_cache'):
+ if self.columns.is_unique and hasattr(self, "_item_cache"):
for k in self.columns:
yield k, self._get_item_cache(k)
else:
@@ -966,9 +1044,8 @@ def dot(self, other):
"""
if isinstance(other, (Series, DataFrame)):
common = self.columns.union(other.index)
- if (len(common) > len(self.columns) or
- len(common) > len(other.index)):
- raise ValueError('matrices are not aligned')
+ if len(common) > len(self.columns) or len(common) > len(other.index):
+ raise ValueError("matrices are not aligned")
left = self.reindex(columns=common, copy=False)
right = other.reindex(index=common, copy=False)
@@ -979,13 +1056,15 @@ def dot(self, other):
lvals = self.values
rvals = np.asarray(other)
if lvals.shape[1] != rvals.shape[0]:
- raise ValueError('Dot product shape mismatch, '
- '{s} vs {r}'.format(s=lvals.shape,
- r=rvals.shape))
+ raise ValueError(
+ "Dot product shape mismatch, "
+ "{s} vs {r}".format(s=lvals.shape, r=rvals.shape)
+ )
if isinstance(other, DataFrame):
- return self._constructor(np.dot(lvals, rvals), index=left.index,
- columns=other.columns)
+ return self._constructor(
+ np.dot(lvals, rvals), index=left.index, columns=other.columns
+ )
elif isinstance(other, Series):
return Series(np.dot(lvals, rvals), index=left.index)
elif isinstance(rvals, (np.ndarray, Index)):
@@ -995,7 +1074,7 @@ def dot(self, other):
else:
return Series(result, index=left.index)
else: # pragma: no cover
- raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
+ raise TypeError("unsupported type: {oth}".format(oth=type(other)))
def __matmul__(self, other):
"""
@@ -1013,7 +1092,7 @@ def __rmatmul__(self, other):
# IO methods (to / from other formats)
@classmethod
- def from_dict(cls, data, orient='columns', dtype=None, columns=None):
+ def from_dict(cls, data, orient="columns", dtype=None, columns=None):
"""
Construct DataFrame from dict of array-like or dicts.
@@ -1078,19 +1157,20 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None):
"""
index = None
orient = orient.lower()
- if orient == 'index':
+ if orient == "index":
if len(data) > 0:
# TODO speed up Series case
if isinstance(list(data.values())[0], (Series, dict)):
data = _from_nested_dict(data)
else:
data, index = list(data.values()), list(data.keys())
- elif orient == 'columns':
+ elif orient == "columns":
if columns is not None:
- raise ValueError("cannot use columns parameter with "
- "orient='columns'")
+ raise ValueError(
+ "cannot use columns parameter with " "orient='columns'"
+ )
else: # pragma: no cover
- raise ValueError('only recognize index or columns for orient')
+ raise ValueError("only recognize index or columns for orient")
return cls(data, index=index, columns=columns, dtype=dtype)
@@ -1149,7 +1229,7 @@ def to_numpy(self, dtype=None, copy=False):
result = np.array(self.values, dtype=dtype, copy=copy)
return result
- def to_dict(self, orient='dict', into=dict):
+ def to_dict(self, orient="dict", into=dict):
"""
Convert the DataFrame to a dictionary.
@@ -1239,48 +1319,68 @@ def to_dict(self, orient='dict', into=dict):
defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
"""
if not self.columns.is_unique:
- warnings.warn("DataFrame columns are not unique, some "
- "columns will be omitted.", UserWarning,
- stacklevel=2)
+ warnings.warn(
+ "DataFrame columns are not unique, some " "columns will be omitted.",
+ UserWarning,
+ stacklevel=2,
+ )
# GH16122
into_c = com.standardize_mapping(into)
- if orient.lower().startswith('d'):
- return into_c(
- (k, v.to_dict(into)) for k, v in self.items())
- elif orient.lower().startswith('l'):
+ if orient.lower().startswith("d"):
+ return into_c((k, v.to_dict(into)) for k, v in self.items())
+ elif orient.lower().startswith("l"):
return into_c((k, v.tolist()) for k, v in self.items())
- elif orient.lower().startswith('sp'):
- return into_c((('index', self.index.tolist()),
- ('columns', self.columns.tolist()),
- ('data', [
- list(map(com.maybe_box_datetimelike, t))
- for t in self.itertuples(index=False, name=None)
- ])))
- elif orient.lower().startswith('s'):
- return into_c((k, com.maybe_box_datetimelike(v))
- for k, v in self.items())
- elif orient.lower().startswith('r'):
+ elif orient.lower().startswith("sp"):
+ return into_c(
+ (
+ ("index", self.index.tolist()),
+ ("columns", self.columns.tolist()),
+ (
+ "data",
+ [
+ list(map(com.maybe_box_datetimelike, t))
+ for t in self.itertuples(index=False, name=None)
+ ],
+ ),
+ )
+ )
+ elif orient.lower().startswith("s"):
+ return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items())
+ elif orient.lower().startswith("r"):
columns = self.columns.tolist()
- rows = (dict(zip(columns, row))
- for row in self.itertuples(index=False, name=None))
+ rows = (
+ dict(zip(columns, row))
+ for row in self.itertuples(index=False, name=None)
+ )
return [
- into_c((k, com.maybe_box_datetimelike(v))
- for k, v in row.items())
- for row in rows]
- elif orient.lower().startswith('i'):
+ into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items())
+ for row in rows
+ ]
+ elif orient.lower().startswith("i"):
if not self.index.is_unique:
- raise ValueError(
- "DataFrame index must be unique for orient='index'."
- )
- return into_c((t[0], dict(zip(self.columns, t[1:])))
- for t in self.itertuples(name=None))
+ raise ValueError("DataFrame index must be unique for orient='index'.")
+ return into_c(
+ (t[0], dict(zip(self.columns, t[1:])))
+ for t in self.itertuples(name=None)
+ )
else:
raise ValueError("orient '{o}' not understood".format(o=orient))
- def to_gbq(self, destination_table, project_id=None, chunksize=None,
- reauth=False, if_exists='fail', auth_local_webserver=False,
- table_schema=None, location=None, progress_bar=True,
- credentials=None, verbose=None, private_key=None):
+ def to_gbq(
+ self,
+ destination_table,
+ project_id=None,
+ chunksize=None,
+ reauth=False,
+ if_exists="fail",
+ auth_local_webserver=False,
+ table_schema=None,
+ location=None,
+ progress_bar=True,
+ credentials=None,
+ verbose=None,
+ private_key=None,
+ ):
"""
Write a DataFrame to a Google BigQuery table.
@@ -1376,16 +1476,33 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None,
read_gbq : Read a DataFrame from Google BigQuery.
"""
from pandas.io import gbq
- gbq.to_gbq(self, destination_table, project_id=project_id,
- chunksize=chunksize, reauth=reauth, if_exists=if_exists,
- auth_local_webserver=auth_local_webserver,
- table_schema=table_schema, location=location,
- progress_bar=progress_bar, credentials=credentials,
- verbose=verbose, private_key=private_key)
+
+ gbq.to_gbq(
+ self,
+ destination_table,
+ project_id=project_id,
+ chunksize=chunksize,
+ reauth=reauth,
+ if_exists=if_exists,
+ auth_local_webserver=auth_local_webserver,
+ table_schema=table_schema,
+ location=location,
+ progress_bar=progress_bar,
+ credentials=credentials,
+ verbose=verbose,
+ private_key=private_key,
+ )
@classmethod
- def from_records(cls, data, index=None, exclude=None, columns=None,
- coerce_float=False, nrows=None):
+ def from_records(
+ cls,
+ data,
+ index=None,
+ exclude=None,
+ columns=None,
+ coerce_float=False,
+ nrows=None,
+ ):
"""
Convert structured or record ndarray to DataFrame.
@@ -1428,7 +1545,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
return cls(index=index, columns=columns)
dtype = None
- if hasattr(first_row, 'dtype') and first_row.dtype.names:
+ if hasattr(first_row, "dtype") and first_row.dtype.names:
dtype = first_row.dtype
values = [first_row]
@@ -1455,8 +1572,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
arr_columns.append(k)
arrays.append(v)
- arrays, arr_columns = reorder_arrays(arrays, arr_columns,
- columns)
+ arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)
elif isinstance(data, (np.ndarray, DataFrame)):
arrays, columns = to_arrays(data, columns)
@@ -1464,8 +1580,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
columns = ensure_index(columns)
arr_columns = columns
else:
- arrays, arr_columns = to_arrays(data, columns,
- coerce_float=coerce_float)
+ arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)
arr_columns = ensure_index(arr_columns)
if columns is not None:
@@ -1480,8 +1595,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
result_index = None
if index is not None:
- if (isinstance(index, str) or
- not hasattr(index, "__iter__")):
+ if isinstance(index, str) or not hasattr(index, "__iter__"):
i = columns.get_loc(index)
exclude.add(index)
if len(arrays) > 0:
@@ -1490,10 +1604,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
result_index = Index([], name=index)
else:
try:
- index_data = [arrays[arr_columns.get_loc(field)]
- for field in index]
- result_index = ensure_index_from_sequences(index_data,
- names=index)
+ index_data = [arrays[arr_columns.get_loc(field)] for field in index]
+ result_index = ensure_index_from_sequences(index_data, names=index)
exclude.update(index)
except Exception:
@@ -1511,8 +1623,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
return cls(mgr)
- def to_records(self, index=True, convert_datetime64=None,
- column_dtypes=None, index_dtypes=None):
+ def to_records(
+ self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None
+ ):
"""
Convert DataFrame to a NumPy record array.
@@ -1604,10 +1717,13 @@ def to_records(self, index=True, convert_datetime64=None,
"""
if convert_datetime64 is not None:
- warnings.warn("The 'convert_datetime64' parameter is "
- "deprecated and will be removed in a future "
- "version",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'convert_datetime64' parameter is "
+ "deprecated and will be removed in a future "
+ "version",
+ FutureWarning,
+ stacklevel=2,
+ )
if index:
if is_datetime64_any_dtype(self.index) and convert_datetime64:
@@ -1619,8 +1735,7 @@ def to_records(self, index=True, convert_datetime64=None,
else:
ix_vals = [self.index.values]
- arrays = ix_vals + [self[c]._internal_get_values()
- for c in self.columns]
+ arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns]
count = 0
index_names = list(self.index.names)
@@ -1628,13 +1743,12 @@ def to_records(self, index=True, convert_datetime64=None,
if isinstance(self.index, MultiIndex):
for i, n in enumerate(index_names):
if n is None:
- index_names[i] = 'level_%d' % count
+ index_names[i] = "level_%d" % count
count += 1
elif index_names[0] is None:
- index_names = ['index']
+ index_names = ["index"]
- names = [str(name) for name in itertools.chain(index_names,
- self.columns)]
+ names = [str(name) for name in itertools.chain(index_names, self.columns)]
else:
arrays = [self[c]._internal_get_values() for c in self.columns]
names = [str(c) for c in self.columns]
@@ -1687,18 +1801,15 @@ def to_records(self, index=True, convert_datetime64=None,
formats.append(dtype_mapping)
else:
element = "row" if i < index_len else "column"
- msg = ("Invalid dtype {dtype} specified for "
- "{element} {name}").format(dtype=dtype_mapping,
- element=element, name=name)
+ msg = (
+ "Invalid dtype {dtype} specified for " "{element} {name}"
+ ).format(dtype=dtype_mapping, element=element, name=name)
raise ValueError(msg)
- return np.rec.fromarrays(
- arrays,
- dtype={'names': names, 'formats': formats}
- )
+ return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
@classmethod
- def from_items(cls, items, columns=None, orient='columns'):
+ def from_items(cls, items, columns=None, orient="columns"):
"""
Construct a DataFrame from a list of tuples.
@@ -1730,23 +1841,28 @@ def from_items(cls, items, columns=None, orient='columns'):
DataFrame
"""
- warnings.warn("from_items is deprecated. Please use "
- "DataFrame.from_dict(dict(items), ...) instead. "
- "DataFrame.from_dict(OrderedDict(items)) may be used to "
- "preserve the key order.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "from_items is deprecated. Please use "
+ "DataFrame.from_dict(dict(items), ...) instead. "
+ "DataFrame.from_dict(OrderedDict(items)) may be used to "
+ "preserve the key order.",
+ FutureWarning,
+ stacklevel=2,
+ )
keys, values = zip(*items)
- if orient == 'columns':
+ if orient == "columns":
if columns is not None:
columns = ensure_index(columns)
idict = dict(items)
if len(idict) < len(items):
if not columns.equals(ensure_index(keys)):
- raise ValueError('With non-unique item names, passed '
- 'columns must be identical')
+ raise ValueError(
+ "With non-unique item names, passed "
+ "columns must be identical"
+ )
arrays = values
else:
arrays = [idict[k] for k in columns if k in idict]
@@ -1761,10 +1877,12 @@ def from_items(cls, items, columns=None, orient='columns'):
except ValueError:
if not is_nested_list_like(values):
- raise ValueError('The value in each (key, value) pair '
- 'must be an array, Series, or dict')
+ raise ValueError(
+ "The value in each (key, value) pair "
+ "must be an array, Series, or dict"
+ )
- elif orient == 'index':
+ elif orient == "index":
if columns is None:
raise TypeError("Must pass columns with orient='index'")
@@ -1779,8 +1897,10 @@ def from_items(cls, items, columns=None, orient='columns'):
except TypeError:
if not is_nested_list_like(values):
- raise ValueError('The value in each (key, value) pair '
- 'must be an array, Series, or dict')
+ raise ValueError(
+ "The value in each (key, value) pair "
+ "must be an array, Series, or dict"
+ )
else: # pragma: no cover
raise ValueError("'orient' must be either 'columns' or 'index'")
@@ -1790,7 +1910,7 @@ def _from_arrays(cls, arrays, columns, index, dtype=None):
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
return cls(mgr)
- def to_sparse(self, fill_value=None, kind='block'):
+ def to_sparse(self, fill_value=None, kind="block"):
"""
Convert to SparseDataFrame.
@@ -1846,21 +1966,39 @@ def to_sparse(self, fill_value=None, kind='block'):
>>> type(sdf) # doctest: +SKIP
<class 'pandas.core.sparse.frame.SparseDataFrame'>
"""
- warnings.warn("DataFrame.to_sparse is deprecated and will be removed "
- "in a future version", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "DataFrame.to_sparse is deprecated and will be removed "
+ "in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
from pandas.core.sparse.api import SparseDataFrame
+
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="SparseDataFrame")
- return SparseDataFrame(self._series, index=self.index,
- columns=self.columns, default_kind=kind,
- default_fill_value=fill_value)
+ return SparseDataFrame(
+ self._series,
+ index=self.index,
+ columns=self.columns,
+ default_kind=kind,
+ default_fill_value=fill_value,
+ )
- @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
- def to_stata(self, fname, convert_dates=None, write_index=True,
- encoding="latin-1", byteorder=None, time_stamp=None,
- data_label=None, variable_labels=None, version=114,
- convert_strl=None):
+ @deprecate_kwarg(old_arg_name="encoding", new_arg_name=None)
+ def to_stata(
+ self,
+ fname,
+ convert_dates=None,
+ write_index=True,
+ encoding="latin-1",
+ byteorder=None,
+ time_stamp=None,
+ data_label=None,
+ variable_labels=None,
+ version=114,
+ convert_strl=None,
+ ):
"""
Export DataFrame object to Stata dta format.
@@ -1943,20 +2081,29 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
"""
kwargs = {}
if version not in (114, 117):
- raise ValueError('Only formats 114 and 117 supported.')
+ raise ValueError("Only formats 114 and 117 supported.")
if version == 114:
if convert_strl is not None:
- raise ValueError('strl support is only available when using '
- 'format 117')
+ raise ValueError(
+ "strl support is only available when using " "format 117"
+ )
from pandas.io.stata import StataWriter as statawriter
else:
from pandas.io.stata import StataWriter117 as statawriter
- kwargs['convert_strl'] = convert_strl
- writer = statawriter(fname, self, convert_dates=convert_dates,
- byteorder=byteorder, time_stamp=time_stamp,
- data_label=data_label, write_index=write_index,
- variable_labels=variable_labels, **kwargs)
+ kwargs["convert_strl"] = convert_strl
+
+ writer = statawriter(
+ fname,
+ self,
+ convert_dates=convert_dates,
+ byteorder=byteorder,
+ time_stamp=time_stamp,
+ data_label=data_label,
+ write_index=write_index,
+ variable_labels=variable_labels,
+ **kwargs
+ )
writer.write_file()
def to_feather(self, fname):
@@ -1971,10 +2118,18 @@ def to_feather(self, fname):
string file path
"""
from pandas.io.feather_format import to_feather
+
to_feather(self, fname)
- def to_parquet(self, fname, engine='auto', compression='snappy',
- index=None, partition_cols=None, **kwargs):
+ def to_parquet(
+ self,
+ fname,
+ engine="auto",
+ compression="snappy",
+ index=None,
+ partition_cols=None,
+ **kwargs
+ ):
"""
Write a DataFrame to the binary parquet format.
@@ -2041,24 +2196,51 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
1 2 4
"""
from pandas.io.parquet import to_parquet
- to_parquet(self, fname, engine,
- compression=compression, index=index,
- partition_cols=partition_cols, **kwargs)
-
- @Substitution(header='Whether to print column labels, default True',
- col_space_type='str or int',
- col_space='The minimum width of each column in CSS length '
- 'units. An int is assumed to be px units.\n\n'
- ' .. versionadded:: 0.25.0\n'
- ' Ability to use str')
- @Substitution(shared_params=fmt.common_docstring,
- returns=fmt.return_docstring)
- def to_html(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='NaN', formatters=None, float_format=None,
- sparsify=None, index_names=True, justify=None, max_rows=None,
- max_cols=None, show_dimensions=False, decimal='.',
- bold_rows=True, classes=None, escape=True, notebook=False,
- border=None, table_id=None, render_links=False):
+
+ to_parquet(
+ self,
+ fname,
+ engine,
+ compression=compression,
+ index=index,
+ partition_cols=partition_cols,
+ **kwargs
+ )
+
+ @Substitution(
+ header="Whether to print column labels, default True",
+ col_space_type="str or int",
+ col_space="The minimum width of each column in CSS length "
+ "units. An int is assumed to be px units.\n\n"
+ " .. versionadded:: 0.25.0\n"
+ " Ability to use str",
+ )
+ @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+ def to_html(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ justify=None,
+ max_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ decimal=".",
+ bold_rows=True,
+ classes=None,
+ escape=True,
+ notebook=False,
+ border=None,
+ table_id=None,
+ render_links=False,
+ ):
"""
Render a DataFrame as an HTML table.
%(shared_params)s
@@ -2091,23 +2273,31 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True,
to_string : Convert DataFrame to a string.
"""
- if (justify is not None and
- justify not in fmt._VALID_JUSTIFY_PARAMETERS):
+ if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
raise ValueError("Invalid value for justify parameter")
- formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
- col_space=col_space, na_rep=na_rep,
- formatters=formatters,
- float_format=float_format,
- sparsify=sparsify, justify=justify,
- index_names=index_names,
- header=header, index=index,
- bold_rows=bold_rows, escape=escape,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=decimal, table_id=table_id,
- render_links=render_links)
+ formatter = fmt.DataFrameFormatter(
+ self,
+ buf=buf,
+ columns=columns,
+ col_space=col_space,
+ na_rep=na_rep,
+ formatters=formatters,
+ float_format=float_format,
+ sparsify=sparsify,
+ justify=justify,
+ index_names=index_names,
+ header=header,
+ index=index,
+ bold_rows=bold_rows,
+ escape=escape,
+ max_rows=max_rows,
+ max_cols=max_cols,
+ show_dimensions=show_dimensions,
+ decimal=decimal,
+ table_id=table_id,
+ render_links=render_links,
+ )
# TODO: a generic formatter wld b in DataFrameFormatter
formatter.to_html(classes=classes, notebook=notebook, border=border)
@@ -2116,8 +2306,9 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True,
# ----------------------------------------------------------------------
- def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
- null_counts=None):
+ def info(
+ self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
+ ):
"""
Print a concise summary of a DataFrame.
@@ -2257,7 +2448,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
lines.append(self.index._summary())
if len(self.columns) == 0:
- lines.append('Empty {name}'.format(name=type(self).__name__))
+ lines.append("Empty {name}".format(name=type(self).__name__))
fmt.buffer_put_lines(buf, lines)
return
@@ -2265,21 +2456,18 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
# hack
if max_cols is None:
- max_cols = get_option('display.max_info_columns',
- len(self.columns) + 1)
+ max_cols = get_option("display.max_info_columns", len(self.columns) + 1)
- max_rows = get_option('display.max_info_rows', len(self) + 1)
+ max_rows = get_option("display.max_info_rows", len(self) + 1)
if null_counts is None:
- show_counts = ((len(self.columns) <= max_cols) and
- (len(self) < max_rows))
+ show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows)
else:
show_counts = null_counts
exceeds_info_cols = len(self.columns) > max_cols
def _verbose_repr():
- lines.append('Data columns (total %d columns):' %
- len(self.columns))
+ lines.append("Data columns (total %d columns):" % len(self.columns))
space = max(len(pprint_thing(k)) for k in self.columns) + 4
counts = None
@@ -2288,9 +2476,11 @@ def _verbose_repr():
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
- 'Columns must equal counts '
- '({cols:d} != {counts:d})'.format(
- cols=len(cols), counts=len(counts)))
+ "Columns must equal counts "
+ "({cols:d} != {counts:d})".format(
+ cols=len(cols), counts=len(counts)
+ )
+ )
tmpl = "{count} non-null {dtype}"
dtypes = self.dtypes
@@ -2302,22 +2492,24 @@ def _verbose_repr():
if show_counts:
count = counts.iloc[i]
- lines.append(_put_str(col, space) + tmpl.format(count=count,
- dtype=dtype))
+ lines.append(
+ _put_str(col, space) + tmpl.format(count=count, dtype=dtype)
+ )
def _non_verbose_repr():
- lines.append(self.columns._summary(name='Columns'))
+ lines.append(self.columns._summary(name="Columns"))
def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
- for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
+ for x in ["bytes", "KB", "MB", "GB", "TB"]:
if num < 1024.0:
- return ("{num:3.1f}{size_q} "
- "{x}".format(num=num, size_q=size_qualifier, x=x))
+ return "{num:3.1f}{size_q} " "{x}".format(
+ num=num, size_q=size_qualifier, x=x
+ )
num /= 1024.0
- return "{num:3.1f}{size_q} {pb}".format(num=num,
- size_q=size_qualifier,
- pb='PB')
+ return "{num:3.1f}{size_q} {pb}".format(
+ num=num, size_q=size_qualifier, pb="PB"
+ )
if verbose:
_verbose_repr()
@@ -2330,28 +2522,29 @@ def _sizeof_fmt(num, size_qualifier):
_verbose_repr()
counts = self._data.get_dtype_counts()
- dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
- in sorted(counts.items())]
- lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
+ dtypes = ["{k}({kk:d})".format(k=k[0], kk=k[1]) for k in sorted(counts.items())]
+ lines.append("dtypes: {types}".format(types=", ".join(dtypes)))
if memory_usage is None:
- memory_usage = get_option('display.memory_usage')
+ memory_usage = get_option("display.memory_usage")
if memory_usage:
# append memory usage of df to display
- size_qualifier = ''
- if memory_usage == 'deep':
+ size_qualifier = ""
+ if memory_usage == "deep":
deep = True
else:
# size_qualifier is just a best effort; not guaranteed to catch
# all cases (e.g., it misses categorical data even with object
# categories)
deep = False
- if ('object' in counts or
- self.index._is_memory_usage_qualified()):
- size_qualifier = '+'
+ if "object" in counts or self.index._is_memory_usage_qualified():
+ size_qualifier = "+"
mem_usage = self.memory_usage(index=True, deep=deep).sum()
- lines.append("memory usage: {mem}\n".format(
- mem=_sizeof_fmt(mem_usage, size_qualifier)))
+ lines.append(
+ "memory usage: {mem}\n".format(
+ mem=_sizeof_fmt(mem_usage, size_qualifier)
+ )
+ )
fmt.buffer_put_lines(buf, lines)
@@ -2439,11 +2632,14 @@ def memory_usage(self, index=True, deep=False):
>>> df['object'].astype('category').memory_usage(deep=True)
5216
"""
- result = Series([c.memory_usage(index=False, deep=deep)
- for col, c in self.iteritems()], index=self.columns)
+ result = Series(
+ [c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()],
+ index=self.columns,
+ )
if index:
- result = Series(self.index.memory_usage(deep=deep),
- index=['Index']).append(result)
+ result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append(
+ result
+ )
return result
def transpose(self, *args, **kwargs):
@@ -2566,13 +2762,13 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover
(vals, idx, cols), object_state = state
index = com._unpickle_array(idx)
- dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
- copy=False)
+ dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols), copy=False)
if object_state is not None:
ovals, _, ocols = object_state
- objects = DataFrame(ovals, index=index,
- columns=com._unpickle_array(ocols), copy=False)
+ objects = DataFrame(
+ ovals, index=index, columns=com._unpickle_array(ocols), copy=False
+ )
dm = dm.join(objects)
@@ -2599,10 +2795,13 @@ def get_value(self, index, col, takeable=False):
scalar
"""
- warnings.warn("get_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._get_value(index, col, takeable=takeable)
def _get_value(self, index, col, takeable=False):
@@ -2629,6 +2828,7 @@ def _get_value(self, index, col, takeable=False):
col = self.columns.get_loc(col)
index = self.index.get_loc(index)
return self._get_value(index, col, takeable=True)
+
_get_value.__doc__ = get_value.__doc__
def set_value(self, index, col, value, takeable=False):
@@ -2651,10 +2851,13 @@ def set_value(self, index, col, value, takeable=False):
If label pair is contained, will be reference to calling DataFrame,
otherwise a new object.
"""
- warnings.warn("set_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._set_value(index, col, value, takeable=takeable)
def _set_value(self, index, col, value, takeable=False):
@@ -2677,6 +2880,7 @@ def _set_value(self, index, col, value, takeable=False):
self._item_cache.pop(col, None)
return self
+
_set_value.__doc__ = set_value.__doc__
def _ixs(self, i, axis=0):
@@ -2706,12 +2910,15 @@ def _ixs(self, i, axis=0):
return new_values
# if we are a copy, mark as such
- copy = (isinstance(new_values, np.ndarray) and
- new_values.base is None)
- result = self._constructor_sliced(new_values,
- index=self.columns,
- name=self.index[i],
- dtype=new_values.dtype)
+ copy = (
+ isinstance(new_values, np.ndarray) and new_values.base is None
+ )
+ result = self._constructor_sliced(
+ new_values,
+ index=self.columns,
+ name=self.index[i],
+ dtype=new_values.dtype,
+ )
result._set_is_copy(self, copy=copy)
return result
@@ -2784,8 +2991,7 @@ def __getitem__(self, key):
else:
if is_iterator(key):
key = list(key)
- indexer = self.loc._convert_to_indexer(key, axis=1,
- raise_missing=True)
+ indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True)
# take() does not accept boolean indexers
if getattr(indexer, "dtype", None) == bool:
@@ -2810,11 +3016,15 @@ def _getitem_bool_array(self, key):
# go with the __setitem__ behavior since that is more consistent
# with all other indexing behavior
if isinstance(key, Series) and not key.index.equals(self.index):
- warnings.warn("Boolean Series key will be reindexed to match "
- "DataFrame index.", UserWarning, stacklevel=3)
+ warnings.warn(
+ "Boolean Series key will be reindexed to match " "DataFrame index.",
+ UserWarning,
+ stacklevel=3,
+ )
elif len(key) != len(self.index):
- raise ValueError('Item wrong length %d instead of %d.' %
- (len(key), len(self.index)))
+ raise ValueError(
+ "Item wrong length %d instead of %d." % (len(key), len(self.index))
+ )
# check_bool_indexer will throw exception if Series key cannot
# be reindexed to match DataFrame rows
@@ -2832,8 +3042,9 @@ def _getitem_multilevel(self, key):
result.columns = result_columns
else:
new_values = self.values[:, loc]
- result = self._constructor(new_values, index=self.index,
- columns=result_columns)
+ result = self._constructor(
+ new_values, index=self.index, columns=result_columns
+ )
result = result.__finalize__(self)
# If there is only one column being returned, and its name is
@@ -2846,12 +3057,12 @@ def _getitem_multilevel(self, key):
top = result.columns[0]
if isinstance(top, tuple):
top = top[0]
- if top == '':
- result = result['']
+ if top == "":
+ result = result[""]
if isinstance(result, Series):
- result = self._constructor_sliced(result,
- index=self.index,
- name=key)
+ result = self._constructor_sliced(
+ result, index=self.index, name=key
+ )
result._set_is_copy(self)
return result
@@ -2860,7 +3071,7 @@ def _getitem_multilevel(self, key):
def _getitem_frame(self, key):
if key.values.size and not is_bool_dtype(key.values):
- raise ValueError('Must pass DataFrame with boolean values only')
+ raise ValueError("Must pass DataFrame with boolean values only")
return self.where(key)
def query(self, expr, inplace=False, **kwargs):
@@ -2972,12 +3183,12 @@ def query(self, expr, inplace=False, **kwargs):
A B C C
0 1 10 10
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if not isinstance(expr, str):
msg = "expr must be a string to be evaluated, {0} given"
raise ValueError(msg.format(type(expr)))
- kwargs['level'] = kwargs.pop('level', 0) + 1
- kwargs['target'] = None
+ kwargs["level"] = kwargs.pop("level", 0) + 1
+ kwargs["target"] = None
res = self.eval(expr, **kwargs)
try:
@@ -3084,17 +3295,16 @@ def eval(self, expr, inplace=False, **kwargs):
"""
from pandas.core.computation.eval import eval as _eval
- inplace = validate_bool_kwarg(inplace, 'inplace')
- resolvers = kwargs.pop('resolvers', None)
- kwargs['level'] = kwargs.pop('level', 0) + 1
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ resolvers = kwargs.pop("resolvers", None)
+ kwargs["level"] = kwargs.pop("level", 0) + 1
if resolvers is None:
index_resolvers = self._get_index_resolvers()
- column_resolvers = \
- self._get_space_character_free_column_resolvers()
+ column_resolvers = self._get_space_character_free_column_resolvers()
resolvers = column_resolvers, index_resolvers
- if 'target' not in kwargs:
- kwargs['target'] = self
- kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
+ if "target" not in kwargs:
+ kwargs["target"] = self
+ kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
return _eval(expr, inplace=inplace, **kwargs)
def select_dtypes(self, include=None, exclude=None):
@@ -3176,10 +3386,11 @@ def select_dtypes(self, include=None, exclude=None):
4 True 1.0
5 False 2.0
"""
+
def _get_info_slice(obj, indexer):
"""Slice the info axis of `obj` with `indexer`."""
- if not hasattr(obj, '_info_axis_number'):
- msg = 'object of type {typ!r} has no info axis'
+ if not hasattr(obj, "_info_axis_number"):
+ msg = "object of type {typ!r} has no info axis"
raise TypeError(msg.format(typ=type(obj).__name__))
slices = [slice(None)] * obj.ndim
slices[obj._info_axis_number] = indexer
@@ -3193,19 +3404,22 @@ def _get_info_slice(obj, indexer):
selection = tuple(map(frozenset, (include, exclude)))
if not any(selection):
- raise ValueError('at least one of include or exclude must be '
- 'nonempty')
+ raise ValueError("at least one of include or exclude must be " "nonempty")
# convert the myriad valid dtypes object to a single representation
include, exclude = map(
- lambda x: frozenset(map(infer_dtype_from_object, x)), selection)
+ lambda x: frozenset(map(infer_dtype_from_object, x)), selection
+ )
for dtypes in (include, exclude):
invalidate_string_dtypes(dtypes)
# can't both include AND exclude!
if not include.isdisjoint(exclude):
- raise ValueError('include and exclude overlap on {inc_ex}'.format(
- inc_ex=(include & exclude)))
+ raise ValueError(
+ "include and exclude overlap on {inc_ex}".format(
+ inc_ex=(include & exclude)
+ )
+ )
# empty include/exclude -> defaults to True
# three cases (we've already raised if both are empty)
@@ -3224,8 +3438,9 @@ def _get_info_slice(obj, indexer):
def is_dtype_instance_mapper(idx, dtype):
return idx, functools.partial(issubclass, dtype.type)
- for idx, f in itertools.starmap(is_dtype_instance_mapper,
- enumerate(self.dtypes)):
+ for idx, f in itertools.starmap(
+ is_dtype_instance_mapper, enumerate(self.dtypes)
+ ):
if include: # checks for the case of empty include or exclude
include_these.iloc[idx] = any(map(f, include))
if exclude:
@@ -3256,7 +3471,7 @@ def __setitem__(self, key, value):
if indexer is not None:
return self._setitem_slice(indexer, value)
- if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
+ if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
self._setitem_frame(key, value)
elif isinstance(key, (Series, np.ndarray, list, Index)):
self._setitem_array(key, value)
@@ -3272,8 +3487,9 @@ def _setitem_array(self, key, value):
# also raises Exception if object array with NA values
if com.is_bool_indexer(key):
if len(key) != len(self.index):
- raise ValueError('Item wrong length %d instead of %d!' %
- (len(key), len(self.index)))
+ raise ValueError(
+ "Item wrong length %d instead of %d!" % (len(key), len(self.index))
+ )
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
self._check_setitem_copy()
@@ -3281,7 +3497,7 @@ def _setitem_array(self, key, value):
else:
if isinstance(value, DataFrame):
if len(value.columns) != len(key):
- raise ValueError('Columns must be same length as key')
+ raise ValueError("Columns must be same length as key")
for k1, k2 in zip(key, value.columns):
self[k1] = value[k2]
else:
@@ -3294,14 +3510,12 @@ def _setitem_frame(self, key, value):
# df[df > df2] = 0
if isinstance(key, np.ndarray):
if key.shape != self.shape:
- raise ValueError(
- 'Array conditional must be same shape as self'
- )
+ raise ValueError("Array conditional must be same shape as self")
key = self._constructor(key, **self._construct_axes_dict())
if key.values.size and not is_bool_dtype(key.values):
raise TypeError(
- 'Must pass DataFrame or 2-d ndarray with boolean values only'
+ "Must pass DataFrame or 2-d ndarray with boolean values only"
)
self._check_inplace_setting(value)
@@ -3318,12 +3532,15 @@ def _ensure_valid_index(self, value):
try:
value = Series(value)
except (ValueError, NotImplementedError, TypeError):
- raise ValueError('Cannot set a frame with no defined index '
- 'and a value that cannot be converted to a '
- 'Series')
+ raise ValueError(
+ "Cannot set a frame with no defined index "
+ "and a value that cannot be converted to a "
+ "Series"
+ )
- self._data = self._data.reindex_axis(value.index.copy(), axis=1,
- fill_value=np.nan)
+ self._data = self._data.reindex_axis(
+ value.index.copy(), axis=1, fill_value=np.nan
+ )
def _set_item(self, key, value):
"""
@@ -3364,8 +3581,7 @@ def insert(self, loc, column, value, allow_duplicates=False):
"""
self._ensure_valid_index(value)
value = self._sanitize_column(column, value, broadcast=False)
- self._data.insert(loc, column, value,
- allow_duplicates=allow_duplicates)
+ self._data.insert(loc, column, value, allow_duplicates=allow_duplicates)
def assign(self, **kwargs):
r"""
@@ -3494,8 +3710,9 @@ def reindexer(value):
raise e
# other
- raise TypeError('incompatible index of inserted column '
- 'with frame index')
+ raise TypeError(
+ "incompatible index of inserted column " "with frame index"
+ )
return value
if isinstance(value, Series):
@@ -3541,8 +3758,7 @@ def reindexer(value):
else:
# cast ignores pandas dtypes. so save the dtype first
- infer_dtype, _ = infer_dtype_from_scalar(
- value, pandas_dtype=True)
+ infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)
# upcast
value = cast_scalar_to_array(len(self.index), value)
@@ -3554,8 +3770,7 @@ def reindexer(value):
# broadcast across multiple columns if necessary
if broadcast and key in self.columns and value.ndim == 1:
- if (not self.columns.is_unique or
- isinstance(self.columns, MultiIndex)):
+ if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
existing_piece = self[key]
if isinstance(existing_piece, DataFrame):
value = np.tile(value, (len(existing_piece.columns), 1))
@@ -3564,8 +3779,10 @@ def reindexer(value):
@property
def _series(self):
- return {item: Series(self._data.iget(idx), index=self.index, name=item)
- for idx, item in enumerate(self.columns)}
+ return {
+ item: Series(self._data.iget(idx), index=self.index, name=item)
+ for idx, item in enumerate(self.columns)
+ }
def lookup(self, row_labels, col_labels):
"""
@@ -3599,7 +3816,7 @@ def lookup(self, row_labels, col_labels):
"""
n = len(row_labels)
if n != len(col_labels):
- raise ValueError('Row labels must have same size as column labels')
+ raise ValueError("Row labels must have same size as column labels")
thresh = 1000
if not self._is_mixed_type or n > thresh:
@@ -3607,13 +3824,13 @@ def lookup(self, row_labels, col_labels):
ridx = self.index.get_indexer(row_labels)
cidx = self.columns.get_indexer(col_labels)
if (ridx == -1).any():
- raise KeyError('One or more row labels was not found')
+ raise KeyError("One or more row labels was not found")
if (cidx == -1).any():
- raise KeyError('One or more column labels was not found')
+ raise KeyError("One or more column labels was not found")
flat_index = ridx * len(self.columns) + cidx
result = values.flat[flat_index]
else:
- result = np.empty(n, dtype='O')
+ result = np.empty(n, dtype="O")
for i, (r, c) in enumerate(zip(row_labels, col_labels)):
result[i] = self._get_value(r, c)
@@ -3625,88 +3842,142 @@ def lookup(self, row_labels, col_labels):
# ----------------------------------------------------------------------
# Reindexing and alignment
- def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
- copy):
+ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
frame = self
- columns = axes['columns']
+ columns = axes["columns"]
if columns is not None:
- frame = frame._reindex_columns(columns, method, copy, level,
- fill_value, limit, tolerance)
+ frame = frame._reindex_columns(
+ columns, method, copy, level, fill_value, limit, tolerance
+ )
- index = axes['index']
+ index = axes["index"]
if index is not None:
- frame = frame._reindex_index(index, method, copy, level,
- fill_value, limit, tolerance)
+ frame = frame._reindex_index(
+ index, method, copy, level, fill_value, limit, tolerance
+ )
return frame
- def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
- limit=None, tolerance=None):
- new_index, indexer = self.index.reindex(new_index, method=method,
- level=level, limit=limit,
- tolerance=tolerance)
- return self._reindex_with_indexers({0: [new_index, indexer]},
- copy=copy, fill_value=fill_value,
- allow_dups=False)
-
- def _reindex_columns(self, new_columns, method, copy, level,
- fill_value=None, limit=None, tolerance=None):
- new_columns, indexer = self.columns.reindex(new_columns, method=method,
- level=level, limit=limit,
- tolerance=tolerance)
- return self._reindex_with_indexers({1: [new_columns, indexer]},
- copy=copy, fill_value=fill_value,
- allow_dups=False)
+ def _reindex_index(
+ self,
+ new_index,
+ method,
+ copy,
+ level,
+ fill_value=np.nan,
+ limit=None,
+ tolerance=None,
+ ):
+ new_index, indexer = self.index.reindex(
+ new_index, method=method, level=level, limit=limit, tolerance=tolerance
+ )
+ return self._reindex_with_indexers(
+ {0: [new_index, indexer]},
+ copy=copy,
+ fill_value=fill_value,
+ allow_dups=False,
+ )
+
+ def _reindex_columns(
+ self,
+ new_columns,
+ method,
+ copy,
+ level,
+ fill_value=None,
+ limit=None,
+ tolerance=None,
+ ):
+ new_columns, indexer = self.columns.reindex(
+ new_columns, method=method, level=level, limit=limit, tolerance=tolerance
+ )
+ return self._reindex_with_indexers(
+ {1: [new_columns, indexer]},
+ copy=copy,
+ fill_value=fill_value,
+ allow_dups=False,
+ )
def _reindex_multi(self, axes, copy, fill_value):
"""
We are guaranteed non-Nones in the axes.
"""
- new_index, row_indexer = self.index.reindex(axes['index'])
- new_columns, col_indexer = self.columns.reindex(axes['columns'])
+ new_index, row_indexer = self.index.reindex(axes["index"])
+ new_columns, col_indexer = self.columns.reindex(axes["columns"])
if row_indexer is not None and col_indexer is not None:
indexer = row_indexer, col_indexer
- new_values = algorithms.take_2d_multi(self.values, indexer,
- fill_value=fill_value)
- return self._constructor(new_values, index=new_index,
- columns=new_columns)
+ new_values = algorithms.take_2d_multi(
+ self.values, indexer, fill_value=fill_value
+ )
+ return self._constructor(new_values, index=new_index, columns=new_columns)
else:
- return self._reindex_with_indexers({0: [new_index, row_indexer],
- 1: [new_columns, col_indexer]},
- copy=copy,
- fill_value=fill_value)
-
- @Appender(_shared_docs['align'] % _shared_doc_kwargs)
- def align(self, other, join='outer', axis=None, level=None, copy=True,
- fill_value=None, method=None, limit=None, fill_axis=0,
- broadcast_axis=None):
- return super().align(other, join=join, axis=axis, level=level,
- copy=copy, fill_value=fill_value, method=method,
- limit=limit, fill_axis=fill_axis,
- broadcast_axis=broadcast_axis)
+ return self._reindex_with_indexers(
+ {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
+ copy=copy,
+ fill_value=fill_value,
+ )
+
+ @Appender(_shared_docs["align"] % _shared_doc_kwargs)
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ broadcast_axis=None,
+ ):
+ return super().align(
+ other,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ broadcast_axis=broadcast_axis,
+ )
@Substitution(**_shared_doc_kwargs)
@Appender(NDFrame.reindex.__doc__)
- @rewrite_axis_style_signature('labels', [('method', None),
- ('copy', True),
- ('level', None),
- ('fill_value', np.nan),
- ('limit', None),
- ('tolerance', None)])
+ @rewrite_axis_style_signature(
+ "labels",
+ [
+ ("method", None),
+ ("copy", True),
+ ("level", None),
+ ("fill_value", np.nan),
+ ("limit", None),
+ ("tolerance", None),
+ ],
+ )
def reindex(self, *args, **kwargs):
- axes = validate_axis_style_args(self, args, kwargs, 'labels',
- 'reindex')
+ axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
kwargs.update(axes)
# Pop these, since the values are in `kwargs` under different names
- kwargs.pop('axis', None)
- kwargs.pop('labels', None)
+ kwargs.pop("axis", None)
+ kwargs.pop("labels", None)
return super().reindex(**kwargs)
- def drop(self, labels=None, axis=0, index=None, columns=None,
- level=None, inplace=False, errors='raise'):
+ def drop(
+ self,
+ labels=None,
+ axis=0,
+ index=None,
+ columns=None,
+ level=None,
+ inplace=False,
+ errors="raise",
+ ):
"""
Drop specified labels from rows or columns.
@@ -3829,14 +4100,20 @@ def drop(self, labels=None, axis=0, index=None, columns=None,
falcon speed 320.0 250.0
weight 1.0 0.8
"""
- return super().drop(labels=labels, axis=axis, index=index,
- columns=columns, level=level, inplace=inplace,
- errors=errors)
+ return super().drop(
+ labels=labels,
+ axis=axis,
+ index=index,
+ columns=columns,
+ level=level,
+ inplace=inplace,
+ errors=errors,
+ )
- @rewrite_axis_style_signature('mapper', [('copy', True),
- ('inplace', False),
- ('level', None),
- ('errors', 'ignore')])
+ @rewrite_axis_style_signature(
+ "mapper",
+ [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")],
+ )
def rename(self, *args, **kwargs):
"""
Alter axes labels.
@@ -3946,35 +4223,63 @@ def rename(self, *args, **kwargs):
2 2 5
4 3 6
"""
- axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')
+ axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename")
kwargs.update(axes)
# Pop these, since the values are in `kwargs` under different names
- kwargs.pop('axis', None)
- kwargs.pop('mapper', None)
+ kwargs.pop("axis", None)
+ kwargs.pop("mapper", None)
return super().rename(**kwargs)
@Substitution(**_shared_doc_kwargs)
@Appender(NDFrame.fillna.__doc__)
- def fillna(self, value=None, method=None, axis=None, inplace=False,
- limit=None, downcast=None, **kwargs):
- return super().fillna(value=value, method=method, axis=axis,
- inplace=inplace, limit=limit, downcast=downcast,
- **kwargs)
-
- @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
- def replace(self, to_replace=None, value=None, inplace=False, limit=None,
- regex=False, method='pad'):
- return super().replace(to_replace=to_replace, value=value,
- inplace=inplace, limit=limit, regex=regex,
- method=method)
-
- @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
+ def fillna(
+ self,
+ value=None,
+ method=None,
+ axis=None,
+ inplace=False,
+ limit=None,
+ downcast=None,
+ **kwargs
+ ):
+ return super().fillna(
+ value=value,
+ method=method,
+ axis=axis,
+ inplace=inplace,
+ limit=limit,
+ downcast=downcast,
+ **kwargs
+ )
+
+ @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
+ def replace(
+ self,
+ to_replace=None,
+ value=None,
+ inplace=False,
+ limit=None,
+ regex=False,
+ method="pad",
+ ):
+ return super().replace(
+ to_replace=to_replace,
+ value=value,
+ inplace=inplace,
+ limit=limit,
+ regex=regex,
+ method=method,
+ )
+
+ @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
- return super().shift(periods=periods, freq=freq, axis=axis,
- fill_value=fill_value)
+ return super().shift(
+ periods=periods, freq=freq, axis=axis, fill_value=fill_value
+ )
- def set_index(self, keys, drop=True, append=False, inplace=False,
- verify_integrity=False):
+ def set_index(
+ self, keys, drop=True, append=False, inplace=False, verify_integrity=False
+ ):
"""
Set the DataFrame index using existing columns.
@@ -4064,35 +4369,39 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
3 9 7 2013 84
4 16 10 2014 31
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if not isinstance(keys, list):
keys = [keys]
- err_msg = ('The parameter "keys" may be a column key, one-dimensional '
- 'array, or a list containing only valid column keys and '
- 'one-dimensional arrays.')
+ err_msg = (
+ 'The parameter "keys" may be a column key, one-dimensional '
+ "array, or a list containing only valid column keys and "
+ "one-dimensional arrays."
+ )
missing = []
for col in keys:
- if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray,
- list, abc.Iterator)):
+ if isinstance(
+ col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator)
+ ):
# arrays are fine as long as they are one-dimensional
# iterators get converted to list below
- if getattr(col, 'ndim', 1) != 1:
+ if getattr(col, "ndim", 1) != 1:
raise ValueError(err_msg)
else:
# everything else gets tried as a key; see GH 24969
try:
found = col in self.columns
except TypeError:
- raise TypeError(err_msg + ' Received column of '
- 'type {}'.format(type(col)))
+ raise TypeError(
+ err_msg + " Received column of " "type {}".format(type(col))
+ )
else:
if not found:
missing.append(col)
if missing:
- raise KeyError('None of {} are in the columns'.format(missing))
+ raise KeyError("None of {} are in the columns".format(missing))
if inplace:
frame = self
@@ -4135,18 +4444,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if len(arrays[-1]) != len(self):
# check newest element against length of calling frame, since
# ensure_index_from_sequences would not raise for append=False.
- raise ValueError('Length mismatch: Expected {len_self} rows, '
- 'received array of length {len_col}'.format(
- len_self=len(self),
- len_col=len(arrays[-1])
- ))
+ raise ValueError(
+ "Length mismatch: Expected {len_self} rows, "
+ "received array of length {len_col}".format(
+ len_self=len(self), len_col=len(arrays[-1])
+ )
+ )
index = ensure_index_from_sequences(arrays, names)
if verify_integrity and not index.is_unique:
duplicates = index[index.duplicated()].unique()
- raise ValueError('Index has duplicate keys: {dup}'.format(
- dup=duplicates))
+ raise ValueError("Index has duplicate keys: {dup}".format(dup=duplicates))
# use set to handle duplicate column names gracefully in case of drop
for c in set(to_remove):
@@ -4160,8 +4469,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if not inplace:
return frame
- def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
- col_fill=''):
+ def reset_index(
+ self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
+ ):
"""
Reset the index, or a level of it.
@@ -4303,7 +4613,7 @@ class max type
lion mammal 80.5 run
monkey mammal NaN jump
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if inplace:
new_obj = self
else:
@@ -4339,8 +4649,7 @@ def _maybe_casted_values(index, labels=None):
values = values._data
if mask.any():
- values, changed = maybe_upcast_putmask(
- values, mask, np.nan)
+ values, changed = maybe_upcast_putmask(values, mask, np.nan)
if issubclass(values_type, DatetimeLikeArray):
values = values_type(values, dtype=values_dtype)
@@ -4357,13 +4666,14 @@ def _maybe_casted_values(index, labels=None):
if not drop:
if isinstance(self.index, MultiIndex):
- names = [n if n is not None else ('level_%d' % i)
- for (i, n) in enumerate(self.index.names)]
+ names = [
+ n if n is not None else ("level_%d" % i)
+ for (i, n) in enumerate(self.index.names)
+ ]
to_insert = zip(self.index.levels, self.index.codes)
else:
- default = 'index' if 'index' not in self else 'level_0'
- names = ([default] if self.index.name is None
- else [self.index.name])
+ default = "index" if "index" not in self else "level_0"
+ names = [default] if self.index.name is None else [self.index.name]
to_insert = ((self.index, None),)
multi_col = isinstance(self.columns, MultiIndex)
@@ -4372,13 +4682,14 @@ def _maybe_casted_values(index, labels=None):
continue
name = names[i]
if multi_col:
- col_name = (list(name) if isinstance(name, tuple)
- else [name])
+ col_name = list(name) if isinstance(name, tuple) else [name]
if col_fill is None:
if len(col_name) not in (1, self.columns.nlevels):
- raise ValueError("col_fill=None is incompatible "
- "with incomplete column name "
- "{}".format(name))
+ raise ValueError(
+ "col_fill=None is incompatible "
+ "with incomplete column name "
+ "{}".format(name)
+ )
col_fill = col_name[0]
lev_num = self.columns._get_level_number(col_level)
@@ -4397,24 +4708,23 @@ def _maybe_casted_values(index, labels=None):
# ----------------------------------------------------------------------
# Reindex-based selection methods
- @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
def isna(self):
return super().isna()
- @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
def isnull(self):
return super().isnull()
- @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
def notna(self):
return super().notna()
- @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
def notnull(self):
return super().notnull()
- def dropna(self, axis=0, how='any', thresh=None, subset=None,
- inplace=False):
+ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
"""
Remove missing values.
@@ -4517,17 +4827,18 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
name toy born
1 Batman Batmobile 1940-04-25
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if isinstance(axis, (tuple, list)):
# GH20987
- msg = ("supplying multiple axes to axis is deprecated and "
- "will be removed in a future version.")
+ msg = (
+ "supplying multiple axes to axis is deprecated and "
+ "will be removed in a future version."
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
result = self
for ax in axis:
- result = result.dropna(how=how, thresh=thresh, subset=subset,
- axis=ax)
+ result = result.dropna(how=how, thresh=thresh, subset=subset, axis=ax)
else:
axis = self._get_axis_number(axis)
agg_axis = 1 - axis
@@ -4545,15 +4856,15 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
if thresh is not None:
mask = count >= thresh
- elif how == 'any':
+ elif how == "any":
mask = count == len(agg_obj._get_axis(agg_axis))
- elif how == 'all':
+ elif how == "all":
mask = count > 0
else:
if how is not None:
- raise ValueError('invalid how option: {h}'.format(h=how))
+ raise ValueError("invalid how option: {h}".format(h=how))
else:
- raise TypeError('must specify how or thresh')
+ raise TypeError("must specify how or thresh")
result = self.loc(axis=axis)[mask]
@@ -4562,7 +4873,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
else:
return result
- def drop_duplicates(self, subset=None, keep='first', inplace=False):
+ def drop_duplicates(self, subset=None, keep="first", inplace=False):
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain columns. Indexes, including time indexes
@@ -4587,7 +4898,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False):
if self.empty:
return self.copy()
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
duplicated = self.duplicated(subset, keep=keep)
if inplace:
@@ -4597,7 +4908,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False):
else:
return self[-duplicated]
- def duplicated(self, subset=None, keep='first'):
+ def duplicated(self, subset=None, keep="first"):
"""
Return boolean Series denoting duplicate rows, optionally only
considering certain columns.
@@ -4626,15 +4937,19 @@ def duplicated(self, subset=None, keep='first'):
def f(vals):
labels, shape = algorithms.factorize(
- vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
- return labels.astype('i8', copy=False), len(shape)
+ vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
+ )
+ return labels.astype("i8", copy=False), len(shape)
if subset is None:
subset = self.columns
- elif (not np.iterable(subset) or
- isinstance(subset, str) or
- isinstance(subset, tuple) and subset in self.columns):
- subset = subset,
+ elif (
+ not np.iterable(subset)
+ or isinstance(subset, str)
+ or isinstance(subset, tuple)
+ and subset in self.columns
+ ):
+ subset = (subset,)
# Verify all columns in subset exist in the queried dataframe
# Otherwise, raise a KeyError, same as if you try to __getitem__ with a
@@ -4643,8 +4958,7 @@ def f(vals):
if not diff.empty:
raise KeyError(diff)
- vals = (col.values for name, col in self.iteritems()
- if name in subset)
+ vals = (col.values for name, col in self.iteritems() if name in subset)
labels, shape = map(list, zip(*map(f, vals)))
ids = get_group_index(labels, shape, sort=False, xnull=False)
@@ -4655,23 +4969,30 @@ def f(vals):
@Substitution(**_shared_doc_kwargs)
@Appender(NDFrame.sort_values.__doc__)
- def sort_values(self, by, axis=0, ascending=True, inplace=False,
- kind='quicksort', na_position='last'):
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ def sort_values(
+ self,
+ by,
+ axis=0,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ ):
+ inplace = validate_bool_kwarg(inplace, "inplace")
axis = self._get_axis_number(axis)
if not isinstance(by, list):
by = [by]
if is_sequence(ascending) and len(by) != len(ascending):
- raise ValueError('Length of ascending (%d) != length of by (%d)' %
- (len(ascending), len(by)))
+ raise ValueError(
+ "Length of ascending (%d) != length of by (%d)"
+ % (len(ascending), len(by))
+ )
if len(by) > 1:
from pandas.core.sorting import lexsort_indexer
- keys = [self._get_label_or_level_values(x, axis=axis)
- for x in by]
- indexer = lexsort_indexer(keys, orders=ascending,
- na_position=na_position)
+ keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
+ indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
indexer = ensure_platform_int(indexer)
else:
from pandas.core.sorting import nargsort
@@ -4682,12 +5003,13 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
if isinstance(ascending, (tuple, list)):
ascending = ascending[0]
- indexer = nargsort(k, kind=kind, ascending=ascending,
- na_position=na_position)
+ indexer = nargsort(
+ k, kind=kind, ascending=ascending, na_position=na_position
+ )
- new_data = self._data.take(indexer,
- axis=self._get_block_manager_axis(axis),
- verify=False)
+ new_data = self._data.take(
+ indexer, axis=self._get_block_manager_axis(axis), verify=False
+ )
if inplace:
return self._update_inplace(new_data)
@@ -4696,23 +5018,33 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False,
@Substitution(**_shared_doc_kwargs)
@Appender(NDFrame.sort_index.__doc__)
- def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
- kind='quicksort', na_position='last', sort_remaining=True,
- by=None):
+ def sort_index(
+ self,
+ axis=0,
+ level=None,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ sort_remaining=True,
+ by=None,
+ ):
# TODO: this can be combined with Series.sort_index impl as
# almost identical
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# 10726
if by is not None:
- warnings.warn("by argument to sort_index is deprecated, "
- "please use .sort_values(by=...)",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "by argument to sort_index is deprecated, "
+ "please use .sort_values(by=...)",
+ FutureWarning,
+ stacklevel=2,
+ )
if level is not None:
raise ValueError("unable to simultaneously sort by and level")
- return self.sort_values(by, axis=axis, ascending=ascending,
- inplace=inplace)
+ return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace)
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
@@ -4722,34 +5054,37 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
labels = labels._sort_levels_monotonic()
if level is not None:
- new_axis, indexer = labels.sortlevel(level, ascending=ascending,
- sort_remaining=sort_remaining)
+ new_axis, indexer = labels.sortlevel(
+ level, ascending=ascending, sort_remaining=sort_remaining
+ )
elif isinstance(labels, MultiIndex):
from pandas.core.sorting import lexsort_indexer
- indexer = lexsort_indexer(labels._get_codes_for_sorting(),
- orders=ascending,
- na_position=na_position)
+ indexer = lexsort_indexer(
+ labels._get_codes_for_sorting(),
+ orders=ascending,
+ na_position=na_position,
+ )
else:
from pandas.core.sorting import nargsort
# Check monotonic-ness before sort an index
# GH11080
- if ((ascending and labels.is_monotonic_increasing) or
- (not ascending and labels.is_monotonic_decreasing)):
+ if (ascending and labels.is_monotonic_increasing) or (
+ not ascending and labels.is_monotonic_decreasing
+ ):
if inplace:
return
else:
return self.copy()
- indexer = nargsort(labels, kind=kind, ascending=ascending,
- na_position=na_position)
+ indexer = nargsort(
+ labels, kind=kind, ascending=ascending, na_position=na_position
+ )
baxis = self._get_block_manager_axis(axis)
- new_data = self._data.take(indexer,
- axis=baxis,
- verify=False)
+ new_data = self._data.take(indexer, axis=baxis, verify=False)
# reconstruct axis if needed
new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
@@ -4759,7 +5094,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
else:
return self._constructor(new_data).__finalize__(self)
- def nlargest(self, n, columns, keep='first'):
+ def nlargest(self, n, columns, keep="first"):
"""
Return the first `n` rows ordered by `columns` in descending order.
@@ -4866,12 +5201,9 @@ def nlargest(self, n, columns, keep='first'):
Italy 59000000 1937894 IT
Brunei 434000 12128 BN
"""
- return algorithms.SelectNFrame(self,
- n=n,
- keep=keep,
- columns=columns).nlargest()
+ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
- def nsmallest(self, n, columns, keep='first'):
+ def nsmallest(self, n, columns, keep="first"):
"""
Return the first `n` rows ordered by `columns` in ascending order.
@@ -4968,10 +5300,9 @@ def nsmallest(self, n, columns, keep='first'):
Nauru 11300 182 NR
Anguilla 11300 311 AI
"""
- return algorithms.SelectNFrame(self,
- n=n,
- keep=keep,
- columns=columns).nsmallest()
+ return algorithms.SelectNFrame(
+ self, n=n, keep=keep, columns=columns
+ ).nsmallest()
def swaplevel(self, i=-2, j=-1, axis=0):
"""
@@ -5018,9 +5349,8 @@ def reorder_levels(self, order, axis=0):
type of caller (new object)
"""
axis = self._get_axis_number(axis)
- if not isinstance(self._get_axis(axis),
- MultiIndex): # pragma: no cover
- raise TypeError('Can only reorder levels on a hierarchical axis.')
+ if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
+ raise TypeError("Can only reorder levels on a hierarchical axis.")
result = self.copy()
@@ -5034,7 +5364,7 @@ def reorder_levels(self, order, axis=0):
# Arithmetic / combination related
def _combine_frame(self, other, func, fill_value=None, level=None):
- this, other = self.align(other, join='outer', level=level, copy=False)
+ this, other = self.align(other, join="outer", level=level, copy=False)
new_index, new_columns = this.index, this.columns
def _arith_op(left, right):
@@ -5049,13 +5379,12 @@ def _arith_op(left, right):
return ops.dispatch_to_series(this, other, _arith_op)
else:
result = _arith_op(this.values, other.values)
- return self._constructor(result,
- index=new_index, columns=new_columns,
- copy=False)
+ return self._constructor(
+ result, index=new_index, columns=new_columns, copy=False
+ )
def _combine_match_index(self, other, func, level=None):
- left, right = self.align(other, join='outer', axis=0, level=level,
- copy=False)
+ left, right = self.align(other, join="outer", axis=0, level=level, copy=False)
assert left.index.equals(right.index)
if left._is_mixed_type or right._is_mixed_type:
@@ -5065,14 +5394,13 @@ def _combine_match_index(self, other, func, level=None):
# fastpath --> operate directly on values
with np.errstate(all="ignore"):
new_data = func(left.values.T, right.values).T
- return self._constructor(new_data,
- index=left.index, columns=self.columns,
- copy=False)
+ return self._constructor(
+ new_data, index=left.index, columns=self.columns, copy=False
+ )
def _combine_match_columns(self, other, func, level=None):
assert isinstance(other, Series)
- left, right = self.align(other, join='outer', axis=1, level=level,
- copy=False)
+ left, right = self.align(other, join="outer", axis=1, level=level, copy=False)
assert left.columns.equals(right.index)
return ops.dispatch_to_series(left, right, func, axis="columns")
@@ -5245,8 +5573,7 @@ def combine(self, other, func, fill_value=None, overwrite=True):
result[col] = arr
# convert_objects just in case
- return self._constructor(result, index=new_index,
- columns=new_columns)
+ return self._constructor(result, index=new_index, columns=new_columns)
def combine_first(self, other):
"""
@@ -5304,7 +5631,7 @@ def extract_values(arr):
if is_extension_array_dtype(arr.dtype):
arr = arr.asi8
else:
- arr = arr.view('i8')
+ arr = arr.view("i8")
return arr
def combiner(x, y):
@@ -5324,10 +5651,14 @@ def combiner(x, y):
return self.combine(other, combiner, overwrite=False)
- @deprecate_kwarg(old_arg_name='raise_conflict', new_arg_name='errors',
- mapping={False: 'ignore', True: 'raise'})
- def update(self, other, join='left', overwrite=True, filter_func=None,
- errors='ignore'):
+ @deprecate_kwarg(
+ old_arg_name="raise_conflict",
+ new_arg_name="errors",
+ mapping={False: "ignore", True: "raise"},
+ )
+ def update(
+ self, other, join="left", overwrite=True, filter_func=None, errors="ignore"
+ ):
"""
Modify in place using non-NA values from another DataFrame.
@@ -5440,12 +5771,14 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
2 3 6.0
"""
import pandas.core.computation.expressions as expressions
+
# TODO: Support other joins
- if join != 'left': # pragma: no cover
+ if join != "left": # pragma: no cover
raise NotImplementedError("Only left join is supported")
- if errors not in ['ignore', 'raise']:
- raise ValueError("The parameter errors must be either "
- "'ignore' or 'raise'")
+ if errors not in ["ignore", "raise"]:
+ raise ValueError(
+ "The parameter errors must be either " "'ignore' or 'raise'"
+ )
if not isinstance(other, DataFrame):
other = DataFrame(other)
@@ -5456,10 +5789,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
this = self[col]._values
that = other[col]._values
if filter_func is not None:
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
mask = ~filter_func(this) | isna(that)
else:
- if errors == 'raise':
+ if errors == "raise":
mask_this = notna(that)
mask_that = notna(this)
if any(mask_this & mask_that):
@@ -5479,7 +5812,9 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
# ----------------------------------------------------------------------
# Data reshaping
- _shared_docs['pivot'] = """
+ _shared_docs[
+ "pivot"
+ ] = """
Return reshaped DataFrame organized by given index / column values.
Reshape data (produce a "pivot" table) based on column values. Uses
@@ -5582,13 +5917,16 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
ValueError: Index contains duplicate entries, cannot reshape
"""
- @Substitution('')
- @Appender(_shared_docs['pivot'])
+ @Substitution("")
+ @Appender(_shared_docs["pivot"])
def pivot(self, index=None, columns=None, values=None):
from pandas.core.reshape.pivot import pivot
+
return pivot(self, index=index, columns=columns, values=values)
- _shared_docs['pivot_table'] = """
+ _shared_docs[
+ "pivot_table"
+ ] = """
Create a spreadsheet-style pivot table as a DataFrame. The levels in
the pivot table will be stored in MultiIndex objects (hierarchical
indexes) on the index and columns of the result DataFrame.
@@ -5713,16 +6051,34 @@ def pivot(self, index=None, columns=None, values=None):
small 2.333333 6.0 4.333333 2.0
"""
- @Substitution('')
- @Appender(_shared_docs['pivot_table'])
- def pivot_table(self, values=None, index=None, columns=None,
- aggfunc='mean', fill_value=None, margins=False,
- dropna=True, margins_name='All', observed=False):
+ @Substitution("")
+ @Appender(_shared_docs["pivot_table"])
+ def pivot_table(
+ self,
+ values=None,
+ index=None,
+ columns=None,
+ aggfunc="mean",
+ fill_value=None,
+ margins=False,
+ dropna=True,
+ margins_name="All",
+ observed=False,
+ ):
from pandas.core.reshape.pivot import pivot_table
- return pivot_table(self, values=values, index=index, columns=columns,
- aggfunc=aggfunc, fill_value=fill_value,
- margins=margins, dropna=dropna,
- margins_name=margins_name, observed=observed)
+
+ return pivot_table(
+ self,
+ values=values,
+ index=index,
+ columns=columns,
+ aggfunc=aggfunc,
+ fill_value=fill_value,
+ margins=margins,
+ dropna=dropna,
+ margins_name=margins_name,
+ observed=observed,
+ )
def stack(self, level=-1, dropna=True):
"""
@@ -5955,9 +6311,12 @@ def unstack(self, level=-1, fill_value=None):
dtype: float64
"""
from pandas.core.reshape.reshape import unstack
+
return unstack(self, level, fill_value)
- _shared_docs['melt'] = ("""
+ _shared_docs[
+ "melt"
+ ] = """
Unpivot a DataFrame from wide format to long format, optionally
leaving identifier variables set.
@@ -6050,18 +6409,32 @@ def unstack(self, level=-1, fill_value=None):
0 a B E 1
1 b B E 3
2 c B E 5
- """)
-
- @Appender(_shared_docs['melt'] %
- dict(caller='df.melt(',
- versionadded='.. versionadded:: 0.20.0\n',
- other='melt'))
- def melt(self, id_vars=None, value_vars=None, var_name=None,
- value_name='value', col_level=None):
+ """
+
+ @Appender(
+ _shared_docs["melt"]
+ % dict(
+ caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt"
+ )
+ )
+ def melt(
+ self,
+ id_vars=None,
+ value_vars=None,
+ var_name=None,
+ value_name="value",
+ col_level=None,
+ ):
from pandas.core.reshape.melt import melt
- return melt(self, id_vars=id_vars, value_vars=value_vars,
- var_name=var_name, value_name=value_name,
- col_level=col_level)
+
+ return melt(
+ self,
+ id_vars=id_vars,
+ value_vars=value_vars,
+ var_name=var_name,
+ value_name=value_name,
+ col_level=col_level,
+ )
# ----------------------------------------------------------------------
# Time series-related
@@ -6160,11 +6533,12 @@ def diff(self, periods=1, axis=0):
# ----------------------------------------------------------------------
# Function application
- def _gotitem(self,
- key: Union[str, List[str]],
- ndim: int,
- subset: Optional[Union[Series, ABCDataFrame]] = None,
- ) -> Union[Series, ABCDataFrame]:
+ def _gotitem(
+ self,
+ key: Union[str, List[str]],
+ ndim: int,
+ subset: Optional[Union[Series, ABCDataFrame]] = None,
+ ) -> Union[Series, ABCDataFrame]:
"""
Sub-classes to define. Return a sliced object.
@@ -6184,7 +6558,8 @@ def _gotitem(self,
# TODO: _shallow_copy(subset)?
return subset[key]
- _agg_summary_and_see_also_doc = dedent("""
+ _agg_summary_and_see_also_doc = dedent(
+ """
The aggregation operations are always performed over an axis, either the
index (default) or the column axis. This behavior is different from
`numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
@@ -6204,9 +6579,11 @@ def _gotitem(self,
core.window.Expanding : Perform operations over expanding window.
core.window.EWM : Perform operation over exponential weighted
window.
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
>>> df = pd.DataFrame([[1, 2, 3],
@@ -6238,13 +6615,16 @@ def _gotitem(self,
2 8.0
3 NaN
dtype: float64
- """)
-
- @Substitution(see_also=_agg_summary_and_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='\n.. versionadded:: 0.20.0\n',
- **_shared_doc_kwargs)
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_summary_and_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="\n.. versionadded:: 0.20.0\n",
+ **_shared_doc_kwargs
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, func, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)
@@ -6268,15 +6648,24 @@ def _aggregate(self, arg, axis=0, *args, **kwargs):
agg = aggregate
- @Appender(_shared_docs['transform'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["transform"] % _shared_doc_kwargs)
def transform(self, func, axis=0, *args, **kwargs):
axis = self._get_axis_number(axis)
if axis == 1:
return self.T.transform(func, *args, **kwargs).T
return super().transform(func, *args, **kwargs)
- def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
- result_type=None, args=(), **kwds):
+ def apply(
+ self,
+ func,
+ axis=0,
+ broadcast=None,
+ raw=False,
+ reduce=None,
+ result_type=None,
+ args=(),
+ **kwds
+ ):
"""
Apply a function along an axis of the DataFrame.
@@ -6445,15 +6834,18 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
2 1 2
"""
from pandas.core.apply import frame_apply
- op = frame_apply(self,
- func=func,
- axis=axis,
- broadcast=broadcast,
- raw=raw,
- reduce=reduce,
- result_type=result_type,
- args=args,
- kwds=kwds)
+
+ op = frame_apply(
+ self,
+ func=func,
+ axis=axis,
+ broadcast=broadcast,
+ raw=raw,
+ reduce=reduce,
+ result_type=result_type,
+ args=args,
+ kwds=kwds,
+ )
return op.get_result()
def applymap(self, func):
@@ -6525,8 +6917,7 @@ def infer(x):
# ----------------------------------------------------------------------
# Merging / joining methods
- def append(self, other, ignore_index=False,
- verify_integrity=False, sort=None):
+ def append(self, other, ignore_index=False, verify_integrity=False, sort=None):
"""
Append rows of `other` to the end of caller, returning a new object.
@@ -6624,8 +7015,10 @@ def append(self, other, ignore_index=False,
if isinstance(other, dict):
other = Series(other)
if other.name is None and not ignore_index:
- raise TypeError('Can only append a Series if ignore_index=True'
- ' or if the Series has a name')
+ raise TypeError(
+ "Can only append a Series if ignore_index=True"
+ " or if the Series has a name"
+ )
if other.name is None:
index = None
@@ -6640,9 +7033,11 @@ def append(self, other, ignore_index=False,
except TypeError:
combined_columns = self.columns.astype(object).append(idx_diff)
other = other.reindex(combined_columns, copy=False)
- other = DataFrame(other.values.reshape((1, len(other))),
- index=index,
- columns=combined_columns)
+ other = DataFrame(
+ other.values.reshape((1, len(other))),
+ index=index,
+ columns=combined_columns,
+ )
other = other._convert(datetime=True, timedelta=True)
if not self.columns.equals(combined_columns):
self = self.reindex(columns=combined_columns)
@@ -6652,16 +7047,19 @@ def append(self, other, ignore_index=False,
other = other.reindex(columns=self.columns)
from pandas.core.reshape.concat import concat
+
if isinstance(other, (list, tuple)):
to_concat = [self] + other
else:
to_concat = [self, other]
- return concat(to_concat, ignore_index=ignore_index,
- verify_integrity=verify_integrity,
- sort=sort)
+ return concat(
+ to_concat,
+ ignore_index=ignore_index,
+ verify_integrity=verify_integrity,
+ sort=sort,
+ )
- def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
- sort=False):
+ def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False):
"""
Join columns of another DataFrame.
@@ -6780,27 +7178,37 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
5 K5 A5 NaN
"""
# For SparseDataFrame's benefit
- return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
- rsuffix=rsuffix, sort=sort)
+ return self._join_compat(
+ other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
+ )
- def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
- sort=False):
+ def _join_compat(
+ self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
+ ):
from pandas.core.reshape.merge import merge
from pandas.core.reshape.concat import concat
if isinstance(other, Series):
if other.name is None:
- raise ValueError('Other Series must have a name')
+ raise ValueError("Other Series must have a name")
other = DataFrame({other.name: other})
if isinstance(other, DataFrame):
- return merge(self, other, left_on=on, how=how,
- left_index=on is None, right_index=True,
- suffixes=(lsuffix, rsuffix), sort=sort)
+ return merge(
+ self,
+ other,
+ left_on=on,
+ how=how,
+ left_index=on is None,
+ right_index=True,
+ suffixes=(lsuffix, rsuffix),
+ sort=sort,
+ )
else:
if on is not None:
- raise ValueError('Joining multiple DataFrames only supported'
- ' for joining on index')
+ raise ValueError(
+ "Joining multiple DataFrames only supported" " for joining on index"
+ )
frames = [self] + list(other)
@@ -6808,33 +7216,55 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
# join indexes only using concat
if can_concat:
- if how == 'left':
- res = concat(frames, axis=1, join='outer',
- verify_integrity=True)
+ if how == "left":
+ res = concat(frames, axis=1, join="outer", verify_integrity=True)
return res.reindex(self.index, copy=False)
else:
- return concat(frames, axis=1, join=how,
- verify_integrity=True)
+ return concat(frames, axis=1, join=how, verify_integrity=True)
joined = frames[0]
for frame in frames[1:]:
- joined = merge(joined, frame, how=how, left_index=True,
- right_index=True)
+ joined = merge(
+ joined, frame, how=how, left_index=True, right_index=True
+ )
return joined
- @Substitution('')
+ @Substitution("")
@Appender(_merge_doc, indents=2)
- def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
- left_index=False, right_index=False, sort=False,
- suffixes=('_x', '_y'), copy=True, indicator=False,
- validate=None):
+ def merge(
+ self,
+ right,
+ how="inner",
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ sort=False,
+ suffixes=("_x", "_y"),
+ copy=True,
+ indicator=False,
+ validate=None,
+ ):
from pandas.core.reshape.merge import merge
- return merge(self, right, how=how, on=on, left_on=left_on,
- right_on=right_on, left_index=left_index,
- right_index=right_index, sort=sort, suffixes=suffixes,
- copy=copy, indicator=indicator, validate=validate)
+
+ return merge(
+ self,
+ right,
+ how=how,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ sort=sort,
+ suffixes=suffixes,
+ copy=copy,
+ indicator=indicator,
+ validate=validate,
+ )
def round(self, decimals=0, *args, **kwargs):
"""
@@ -6936,23 +7366,21 @@ def _series_round(s, decimals):
new_cols = [col for col in _dict_round(self, decimals)]
elif is_integer(decimals):
# Dispatch to Series.round
- new_cols = [_series_round(v, decimals)
- for _, v in self.iteritems()]
+ new_cols = [_series_round(v, decimals) for _, v in self.iteritems()]
else:
- raise TypeError("decimals must be an integer, a dict-like or a "
- "Series")
+ raise TypeError("decimals must be an integer, a dict-like or a " "Series")
if len(new_cols) > 0:
- return self._constructor(concat(new_cols, axis=1),
- index=self.index,
- columns=self.columns)
+ return self._constructor(
+ concat(new_cols, axis=1), index=self.index, columns=self.columns
+ )
else:
return self
# ----------------------------------------------------------------------
# Statistical methods, etc.
- def corr(self, method='pearson', min_periods=1):
+ def corr(self, method="pearson", min_periods=1):
"""
Compute pairwise correlation of columns, excluding NA/null values.
@@ -7000,12 +7428,11 @@ def corr(self, method='pearson', min_periods=1):
idx = cols.copy()
mat = numeric_df.values
- if method == 'pearson':
+ if method == "pearson":
correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
- elif method == 'spearman':
- correl = libalgos.nancorr_spearman(ensure_float64(mat),
- minp=min_periods)
- elif method == 'kendall' or callable(method):
+ elif method == "spearman":
+ correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
+ elif method == "kendall" or callable(method):
if min_periods is None:
min_periods = 1
mat = ensure_float64(mat).T
@@ -7022,7 +7449,7 @@ def corr(self, method='pearson', min_periods=1):
if valid.sum() < min_periods:
c = np.nan
elif i == j:
- c = 1.
+ c = 1.0
elif not valid.all():
c = corrf(ac[valid], bc[valid])
else:
@@ -7030,9 +7457,11 @@ def corr(self, method='pearson', min_periods=1):
correl[i, j] = c
correl[j, i] = c
else:
- raise ValueError("method must be either 'pearson', "
- "'spearman', 'kendall', or a callable, "
- "'{method}' was supplied".format(method=method))
+ raise ValueError(
+ "method must be either 'pearson', "
+ "'spearman', 'kendall', or a callable, "
+ "'{method}' was supplied".format(method=method)
+ )
return self._constructor(correl, index=idx, columns=cols)
@@ -7142,12 +7571,11 @@ def cov(self, min_periods=None):
baseCov = np.cov(mat.T)
baseCov = baseCov.reshape((len(cols), len(cols)))
else:
- baseCov = libalgos.nancorr(ensure_float64(mat), cov=True,
- minp=min_periods)
+ baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
return self._constructor(baseCov, index=idx, columns=cols)
- def corrwith(self, other, axis=0, drop=False, method='pearson'):
+ def corrwith(self, other, axis=0, drop=False, method="pearson"):
"""
Compute pairwise correlation between rows or columns of DataFrame
with rows or columns of Series or DataFrame. DataFrames are first
@@ -7183,17 +7611,16 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'):
this = self._get_numeric_data()
if isinstance(other, Series):
- return this.apply(lambda x: other.corr(x, method=method),
- axis=axis)
+ return this.apply(lambda x: other.corr(x, method=method), axis=axis)
other = other._get_numeric_data()
- left, right = this.align(other, join='inner', copy=False)
+ left, right = this.align(other, join="inner", copy=False)
if axis == 1:
left = left.T
right = right.T
- if method == 'pearson':
+ if method == "pearson":
# mask missing values
left = left + right * 0
right = right + left * 0
@@ -7207,31 +7634,31 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'):
correl = num / dom
- elif method in ['kendall', 'spearman'] or callable(method):
+ elif method in ["kendall", "spearman"] or callable(method):
+
def c(x):
return nanops.nancorr(x[0], x[1], method=method)
- correl = Series(map(c,
- zip(left.values.T, right.values.T)),
- index=left.columns)
+ correl = Series(
+ map(c, zip(left.values.T, right.values.T)), index=left.columns
+ )
else:
- raise ValueError("Invalid method {method} was passed, "
- "valid methods are: 'pearson', 'kendall', "
- "'spearman', or callable".
- format(method=method))
+ raise ValueError(
+ "Invalid method {method} was passed, "
+ "valid methods are: 'pearson', 'kendall', "
+ "'spearman', or callable".format(method=method)
+ )
if not drop:
# Find non-matching labels along the given axis
# and append missing correlations (GH 22375)
raxis = 1 if axis == 0 else 0
- result_index = (this._get_axis(raxis).
- union(other._get_axis(raxis)))
+ result_index = this._get_axis(raxis).union(other._get_axis(raxis))
idx_diff = result_index.difference(correl.index)
if len(idx_diff) > 0:
- correl = correl.append(Series([np.nan] * len(idx_diff),
- index=idx_diff))
+ correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff))
return correl
@@ -7316,8 +7743,7 @@ def count(self, axis=0, level=None, numeric_only=False):
"""
axis = self._get_axis_number(axis)
if level is not None:
- return self._count_level(level, axis=axis,
- numeric_only=numeric_only)
+ return self._count_level(level, axis=axis, numeric_only=numeric_only)
if numeric_only:
frame = self._get_numeric_data()
@@ -7338,7 +7764,7 @@ def count(self, axis=0, level=None, numeric_only=False):
counts = series_counts.values
result = Series(counts, index=frame._get_agg_axis(axis))
- return result.astype('int64')
+ return result.astype("int64")
def _count_level(self, level, axis=0, numeric_only=False):
if numeric_only:
@@ -7350,8 +7776,10 @@ def _count_level(self, level, axis=0, numeric_only=False):
agg_axis = frame._get_agg_axis(axis)
if not isinstance(count_axis, MultiIndex):
- raise TypeError("Can only count levels on hierarchical "
- "{ax}.".format(ax=self._get_axis_name(axis)))
+ raise TypeError(
+ "Can only count levels on hierarchical "
+ "{ax}.".format(ax=self._get_axis_name(axis))
+ )
if frame._is_mixed_type:
# Since we have mixed types, calling notna(frame.values) might
@@ -7371,8 +7799,7 @@ def _count_level(self, level, axis=0, numeric_only=False):
level_index = count_axis.levels[level]
level_codes = ensure_int64(count_axis.codes[level])
- counts = lib.count_level_2d(mask, level_codes, len(level_index),
- axis=0)
+ counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)
result = DataFrame(counts, index=level_index, columns=agg_axis)
@@ -7382,9 +7809,10 @@ def _count_level(self, level, axis=0, numeric_only=False):
else:
return result
- def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
- filter_type=None, **kwds):
- if axis is None and filter_type == 'bool':
+ def _reduce(
+ self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
+ ):
+ if axis is None and filter_type == "bool":
labels = None
constructor = None
else:
@@ -7397,9 +7825,14 @@ def f(x):
return op(x, axis=axis, skipna=skipna, **kwds)
# exclude timedelta/datetime unless we are uniform types
- if (axis == 1 and self._is_datelike_mixed_type
- and (not self._is_homogeneous_type
- and not is_datetime64tz_dtype(self.dtypes[0]))):
+ if (
+ axis == 1
+ and self._is_datelike_mixed_type
+ and (
+ not self._is_homogeneous_type
+ and not is_datetime64tz_dtype(self.dtypes[0])
+ )
+ ):
numeric_only = True
if numeric_only is None:
@@ -7407,8 +7840,7 @@ def f(x):
values = self.values
result = f(values)
- if (filter_type == 'bool' and is_object_dtype(values) and
- axis is None):
+ if filter_type == "bool" and is_object_dtype(values) and axis is None:
# work around https://github.com/numpy/numpy/issues/10489
# TODO: combine with hasattr(result, 'dtype') further down
# hard since we don't have `values` down there.
@@ -7428,10 +7860,10 @@ def f(x):
# column-by-column reduction, where we have mixed type.
# So let's just do what we can
from pandas.core.apply import frame_apply
- opa = frame_apply(self,
- func=f,
- result_type='expand',
- ignore_failures=True)
+
+ opa = frame_apply(
+ self, func=f, result_type="expand", ignore_failures=True
+ )
result = opa.get_result()
if result.ndim == self.ndim:
result = result.iloc[0]
@@ -7439,28 +7871,31 @@ def f(x):
except Exception:
pass
- if filter_type is None or filter_type == 'numeric':
+ if filter_type is None or filter_type == "numeric":
data = self._get_numeric_data()
- elif filter_type == 'bool':
+ elif filter_type == "bool":
data = self._get_bool_data()
else: # pragma: no cover
e = NotImplementedError(
"Handling exception with filter_type {f} not"
- "implemented.".format(f=filter_type))
+ "implemented.".format(f=filter_type)
+ )
raise_with_traceback(e)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = f(data.values)
labels = data._get_agg_axis(axis)
else:
if numeric_only:
- if filter_type is None or filter_type == 'numeric':
+ if filter_type is None or filter_type == "numeric":
data = self._get_numeric_data()
- elif filter_type == 'bool':
+ elif filter_type == "bool":
# GH 25101, # GH 24434
data = self._get_bool_data() if axis == 0 else self
else: # pragma: no cover
- msg = ("Generating numeric_only data with filter_type {f}"
- "not supported.".format(f=filter_type))
+ msg = (
+ "Generating numeric_only data with filter_type {f}"
+ "not supported.".format(f=filter_type)
+ )
raise NotImplementedError(msg)
values = data.values
labels = data._get_agg_axis(axis)
@@ -7468,11 +7903,11 @@ def f(x):
values = self.values
result = f(values)
- if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
+ if hasattr(result, "dtype") and is_object_dtype(result.dtype):
try:
- if filter_type is None or filter_type == 'numeric':
+ if filter_type is None or filter_type == "numeric":
result = result.astype(np.float64)
- elif filter_type == 'bool' and notna(result).all():
+ elif filter_type == "bool" and notna(result).all():
result = result.astype(np.bool_)
except (ValueError, TypeError):
@@ -7609,7 +8044,7 @@ def _get_agg_axis(self, axis_num):
elif axis_num == 1:
return self.index
else:
- raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
+ raise ValueError("Axis must be 0 or 1 (got %r)" % axis_num)
def mode(self, axis=0, numeric_only=False, dropna=True):
"""
@@ -7697,8 +8132,7 @@ def f(s):
return data.apply(f, axis=axis)
- def quantile(self, q=0.5, axis=0, numeric_only=True,
- interpolation='linear'):
+ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
"""
Return values at the given quantile over requested axis.
@@ -7775,10 +8209,9 @@ def quantile(self, q=0.5, axis=0, numeric_only=True,
if is_transposed:
data = data.T
- result = data._data.quantile(qs=q,
- axis=1,
- interpolation=interpolation,
- transposed=is_transposed)
+ result = data._data.quantile(
+ qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
+ )
if result.ndim == 2:
result = self._constructor(result)
@@ -7790,7 +8223,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True,
return result
- def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
+ def to_timestamp(self, freq=None, how="start", axis=0, copy=True):
"""
Cast to DatetimeIndex of timestamps, at *beginning* of period.
@@ -7820,8 +8253,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
elif axis == 1:
new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
else: # pragma: no cover
- raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
- ax=axis))
+ raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis))
return self._constructor(new_data)
@@ -7853,8 +8285,7 @@ def to_period(self, freq=None, axis=0, copy=True):
elif axis == 1:
new_data.set_axis(0, self.columns.to_period(freq=freq))
else: # pragma: no cover
- raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
- ax=axis))
+ raise AssertionError("Axis must be 0 or 1. Got {ax!s}".format(ax=axis))
return self._constructor(new_data)
@@ -7923,29 +8354,36 @@ def isin(self, values):
"""
if isinstance(values, dict):
from pandas.core.reshape.concat import concat
+
values = collections.defaultdict(list, values)
- return concat((self.iloc[:, [i]].isin(values[col])
- for i, col in enumerate(self.columns)), axis=1)
+ return concat(
+ (
+ self.iloc[:, [i]].isin(values[col])
+ for i, col in enumerate(self.columns)
+ ),
+ axis=1,
+ )
elif isinstance(values, Series):
if not values.index.is_unique:
- raise ValueError("cannot compute isin with "
- "a duplicate axis.")
- return self.eq(values.reindex_like(self), axis='index')
+ raise ValueError("cannot compute isin with " "a duplicate axis.")
+ return self.eq(values.reindex_like(self), axis="index")
elif isinstance(values, DataFrame):
if not (values.columns.is_unique and values.index.is_unique):
- raise ValueError("cannot compute isin with "
- "a duplicate axis.")
+ raise ValueError("cannot compute isin with " "a duplicate axis.")
return self.eq(values.reindex_like(self))
else:
if not is_list_like(values):
- raise TypeError("only list-like or dict-like objects are "
- "allowed to be passed to DataFrame.isin(), "
- "you passed a "
- "{0!r}".format(type(values).__name__))
+ raise TypeError(
+ "only list-like or dict-like objects are "
+ "allowed to be passed to DataFrame.isin(), "
+ "you passed a "
+ "{0!r}".format(type(values).__name__)
+ )
return DataFrame(
- algorithms.isin(self.values.ravel(),
- values).reshape(self.shape), self.index,
- self.columns)
+ algorithms.isin(self.values.ravel(), values).reshape(self.shape),
+ self.index,
+ self.columns,
+ )
# ----------------------------------------------------------------------
# Add plotting methods to DataFrame
@@ -7955,11 +8393,17 @@ def isin(self, values):
sparse = CachedAccessor("sparse", SparseFrameAccessor)
-DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
- axes_are_reversed=True, aliases={'rows': 0},
- docs={
- 'index': 'The index (row labels) of the DataFrame.',
- 'columns': 'The column labels of the DataFrame.'})
+DataFrame._setup_axes(
+ ["index", "columns"],
+ info_axis=1,
+ stat_axis=0,
+ axes_are_reversed=True,
+ aliases={"rows": 0},
+ docs={
+ "index": "The index (row labels) of the DataFrame.",
+ "columns": "The column labels of the DataFrame.",
+ },
+)
DataFrame._add_numeric_operations()
DataFrame._add_series_or_dataframe_operations()
@@ -7978,4 +8422,4 @@ def _from_nested_dict(data):
def _put_str(s, space):
- return '{s}'.format(s=s)[:space].ljust(space)
+ return "{s}".format(s=s)[:space].ljust(space)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 106af6e565f8a..4e9f74162ae78 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -19,17 +19,32 @@
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
-from pandas.util._decorators import (
- Appender, Substitution, rewrite_axis_style_signature)
+from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
from pandas.core.dtypes.common import (
- ensure_int64, ensure_object, ensure_str, is_bool, is_bool_dtype,
- is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
- is_dict_like, is_extension_array_dtype, is_integer, is_list_like,
- is_number, is_numeric_dtype, is_object_dtype, is_period_arraylike,
- is_re_compilable, is_scalar, is_timedelta64_dtype, pandas_dtype)
+ ensure_int64,
+ ensure_object,
+ ensure_str,
+ is_bool,
+ is_bool_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_dict_like,
+ is_extension_array_dtype,
+ is_integer,
+ is_list_like,
+ is_number,
+ is_numeric_dtype,
+ is_object_dtype,
+ is_period_arraylike,
+ is_re_compilable,
+ is_scalar,
+ is_timedelta64_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna
@@ -41,7 +56,12 @@
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.index import (
- Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index)
+ Index,
+ InvalidIndexError,
+ MultiIndex,
+ RangeIndex,
+ ensure_index,
+)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import Period, PeriodIndex
import pandas.core.indexing as indexing
@@ -56,12 +76,14 @@
# able to share
_shared_docs = dict()
_shared_doc_kwargs = dict(
- axes='keywords for axes', klass='Series/DataFrame',
- axes_single_arg='int or labels for object',
- args_transpose='axes to permute (int or label for object)',
+ axes="keywords for axes",
+ klass="Series/DataFrame",
+ axes_single_arg="int or labels for object",
+ args_transpose="axes to permute (int or label for object)",
optional_by="""
by : str or list of str
- Name or list of names to sort by""")
+ Name or list of names to sort by""",
+)
# sentinel value to use as kwarg in place of None when None has special meaning
# and needs to be distinguished from a user explicitly passing None.
@@ -74,8 +96,11 @@ def _single_replace(self, to_replace, method, inplace, limit):
replacement value is given in the replace method
"""
if self.ndim != 1:
- raise TypeError('cannot replace {0} with method {1} on a {2}'
- .format(to_replace, method, type(self).__name__))
+ raise TypeError(
+ "cannot replace {0} with method {1} on a {2}".format(
+ to_replace, method, type(self).__name__
+ )
+ )
orig_dtype = self.dtype
result = self if inplace else self.copy()
@@ -87,8 +112,7 @@ def _single_replace(self, to_replace, method, inplace, limit):
if values.dtype == orig_dtype and inplace:
return
- result = pd.Series(values, index=self.index,
- dtype=self.dtype).__finalize__(self)
+ result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self)
if inplace:
self._update_inplace(result._data)
@@ -108,15 +132,27 @@ class NDFrame(PandasObject, SelectionMixin):
axes : list
copy : boolean, default False
"""
- _internal_names = ['_data', '_cacher', '_item_cache', '_cache', '_is_copy',
- '_subtyp', '_name', '_index', '_default_kind',
- '_default_fill_value', '_metadata', '__array_struct__',
- '__array_interface__'] # type: List[str]
+
+ _internal_names = [
+ "_data",
+ "_cacher",
+ "_item_cache",
+ "_cache",
+ "_is_copy",
+ "_subtyp",
+ "_name",
+ "_index",
+ "_default_kind",
+ "_default_fill_value",
+ "_metadata",
+ "__array_struct__",
+ "__array_interface__",
+ ] # type: List[str]
_internal_names_set = set(_internal_names) # type: Set[str]
_accessors = set() # type: Set[str]
- _deprecations = frozenset([
- 'as_blocks', 'blocks', 'is_copy'
- ]) # type: FrozenSet[str]
+ _deprecations = frozenset(
+ ["as_blocks", "blocks", "is_copy"]
+ ) # type: FrozenSet[str]
_metadata = [] # type: List[str]
_is_copy = None
_data = None # type: BlockManager
@@ -124,12 +160,14 @@ class NDFrame(PandasObject, SelectionMixin):
# ----------------------------------------------------------------------
# Constructors
- def __init__(self,
- data: BlockManager,
- axes: Optional[List[Index]] = None,
- copy: bool = False,
- dtype: Optional[Dtype] = None,
- fastpath: bool = False):
+ def __init__(
+ self,
+ data: BlockManager,
+ axes: Optional[List[Index]] = None,
+ copy: bool = False,
+ dtype: Optional[Dtype] = None,
+ fastpath: bool = False,
+ ):
if not fastpath:
if dtype is not None:
@@ -141,17 +179,17 @@ def __init__(self,
for i, ax in enumerate(axes):
data = data.reindex_axis(ax, axis=i)
- object.__setattr__(self, '_is_copy', None)
- object.__setattr__(self, '_data', data)
- object.__setattr__(self, '_item_cache', {})
+ object.__setattr__(self, "_is_copy", None)
+ object.__setattr__(self, "_data", data)
+ object.__setattr__(self, "_item_cache", {})
def _init_mgr(self, mgr, axes=None, dtype=None, copy=False):
""" passed a manager and a axes dict """
for a, axe in axes.items():
if axe is not None:
- mgr = mgr.reindex_axis(axe,
- axis=self._get_block_manager_axis(a),
- copy=False)
+ mgr = mgr.reindex_axis(
+ axe, axis=self._get_block_manager_axis(a), copy=False
+ )
# make a copy if explicitly requested
if copy:
@@ -169,14 +207,22 @@ def is_copy(self):
"""
Return the copy.
"""
- warnings.warn("Attribute 'is_copy' is deprecated and will be removed "
- "in a future version.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Attribute 'is_copy' is deprecated and will be removed "
+ "in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._is_copy
@is_copy.setter
def is_copy(self, msg):
- warnings.warn("Attribute 'is_copy' is deprecated and will be removed "
- "in a future version.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Attribute 'is_copy' is deprecated and will be removed "
+ "in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
self._is_copy = msg
def _validate_dtype(self, dtype):
@@ -186,10 +232,11 @@ def _validate_dtype(self, dtype):
dtype = pandas_dtype(dtype)
# a compound dtype
- if dtype.kind == 'V':
- raise NotImplementedError("compound dtypes are not implemented"
- " in the {0} constructor"
- .format(self.__class__.__name__))
+ if dtype.kind == "V":
+ raise NotImplementedError(
+ "compound dtypes are not implemented"
+ " in the {0} constructor".format(self.__class__.__name__)
+ )
return dtype
@@ -221,9 +268,18 @@ def _constructor_expanddim(self):
# Axis
@classmethod
- def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None,
- slicers=None, axes_are_reversed=False, build_axes=True,
- ns=None, docs=None):
+ def _setup_axes(
+ cls,
+ axes,
+ info_axis=None,
+ stat_axis=None,
+ aliases=None,
+ slicers=None,
+ axes_are_reversed=False,
+ build_axes=True,
+ ns=None,
+ docs=None,
+ ):
"""Provide axes setup for the major PandasObjects.
Parameters
@@ -248,7 +304,7 @@ def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None,
cls._AXIS_REVERSED = axes_are_reversed
# typ
- setattr(cls, '_typ', cls.__name__.lower())
+ setattr(cls, "_typ", cls.__name__.lower())
# indexing support
cls._ix = None
@@ -293,13 +349,16 @@ def _construct_axes_dict_from(self, axes, **kwargs):
def _construct_axes_dict_for_slice(self, axes=None, **kwargs):
"""Return an axes dictionary for myself."""
- d = {self._AXIS_SLICEMAP[a]: self._get_axis(a)
- for a in (axes or self._AXIS_ORDERS)}
+ d = {
+ self._AXIS_SLICEMAP[a]: self._get_axis(a)
+ for a in (axes or self._AXIS_ORDERS)
+ }
d.update(kwargs)
return d
def _construct_axes_from_arguments(
- self, args, kwargs, require_all=False, sentinel=None):
+ self, args, kwargs, require_all=False, sentinel=None
+ ):
"""Construct and returns axes if supplied in args/kwargs.
If require_all, raise if all axis arguments are not supplied
@@ -319,8 +378,10 @@ def _construct_axes_from_arguments(
if alias is not None:
if a in kwargs:
if alias in kwargs:
- raise TypeError("arguments are mutually exclusive "
- "for [%s,%s]" % (a, alias))
+ raise TypeError(
+ "arguments are mutually exclusive "
+ "for [%s,%s]" % (a, alias)
+ )
continue
if alias in kwargs:
kwargs[a] = kwargs.pop(alias)
@@ -332,8 +393,7 @@ def _construct_axes_from_arguments(
kwargs[a] = args.pop(0)
except IndexError:
if require_all:
- raise TypeError("not enough/duplicate arguments "
- "specified!")
+ raise TypeError("not enough/duplicate arguments " "specified!")
axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS}
return axes, kwargs
@@ -361,8 +421,7 @@ def _get_axis_number(cls, axis):
return cls._AXIS_NUMBERS[axis]
except KeyError:
pass
- raise ValueError('No axis named {0} for object type {1}'
- .format(axis, cls))
+ raise ValueError("No axis named {0} for object type {1}".format(axis, cls))
@classmethod
def _get_axis_name(cls, axis):
@@ -375,8 +434,7 @@ def _get_axis_name(cls, axis):
return cls._AXIS_NAMES[axis]
except KeyError:
pass
- raise ValueError('No axis named {0} for object type {1}'
- .format(axis, cls))
+ raise ValueError("No axis named {0} for object type {1}".format(axis, cls))
def _get_axis(self, axis):
name = self._get_axis_name(axis)
@@ -404,7 +462,7 @@ def _get_axis_resolvers(self, axis):
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
- key = '{prefix}level_{i}'.format(prefix=prefix, i=i)
+ key = "{prefix}level_{i}".format(prefix=prefix, i=i)
level = i
level_values = axis_index.get_level_values(level)
@@ -436,8 +494,7 @@ def _get_space_character_free_column_resolvers(self):
"""
from pandas.core.computation.common import _remove_spaces_column_name
- return {_remove_spaces_column_name(k): v for k, v
- in self.iteritems()}
+ return {_remove_spaces_column_name(k): v for k, v in self.iteritems()}
@property
def _info_axis(self):
@@ -525,7 +582,7 @@ def _expand_axes(self, key):
for k, ax in zip(key, self.axes):
if k not in ax:
if type(k) != ax.dtype.type:
- ax = ax.astype('O')
+ ax = ax.astype("O")
new_axes.append(ax.insert(len(ax), k))
else:
new_axes.append(ax)
@@ -631,17 +688,21 @@ def set_axis(self, labels, axis=0, inplace=None):
warnings.warn(
'set_axis now takes "labels" as first argument, and '
'"axis" as named parameter. The old form, with "axis" as '
- 'first parameter and \"labels\" as second, is still supported '
- 'but will be deprecated in a future version of pandas.',
- FutureWarning, stacklevel=2)
+ 'first parameter and "labels" as second, is still supported '
+ "but will be deprecated in a future version of pandas.",
+ FutureWarning,
+ stacklevel=2,
+ )
labels, axis = axis, labels
if inplace is None:
warnings.warn(
- 'set_axis currently defaults to operating inplace.\nThis '
- 'will change in a future version of pandas, use '
- 'inplace=True to avoid this warning.',
- FutureWarning, stacklevel=2)
+ "set_axis currently defaults to operating inplace.\nThis "
+ "will change in a future version of pandas, use "
+ "inplace=True to avoid this warning.",
+ FutureWarning,
+ stacklevel=2,
+ )
inplace = True
if inplace:
setattr(self, self._get_axis_name(axis), labels)
@@ -678,21 +739,21 @@ def transpose(self, *args, **kwargs):
"""
# construct the args
- axes, kwargs = self._construct_axes_from_arguments(args, kwargs,
- require_all=True)
- axes_names = tuple(self._get_axis_name(axes[a])
- for a in self._AXIS_ORDERS)
- axes_numbers = tuple(self._get_axis_number(axes[a])
- for a in self._AXIS_ORDERS)
+ axes, kwargs = self._construct_axes_from_arguments(
+ args, kwargs, require_all=True
+ )
+ axes_names = tuple(self._get_axis_name(axes[a]) for a in self._AXIS_ORDERS)
+ axes_numbers = tuple(self._get_axis_number(axes[a]) for a in self._AXIS_ORDERS)
# we must have unique axes
if len(axes) != len(set(axes)):
- raise ValueError('Must specify %s unique axes' % self._AXIS_LEN)
+ raise ValueError("Must specify %s unique axes" % self._AXIS_LEN)
- new_axes = self._construct_axes_dict_from(self, [self._get_axis(x)
- for x in axes_names])
+ new_axes = self._construct_axes_dict_from(
+ self, [self._get_axis(x) for x in axes_names]
+ )
new_values = self.values.transpose(axes_numbers)
- if kwargs.pop('copy', None) or (len(args) and args[-1]):
+ if kwargs.pop("copy", None) or (len(args) and args[-1]):
new_values = new_values.copy()
nv.validate_transpose(tuple(), kwargs)
@@ -716,8 +777,7 @@ def swapaxes(self, axis1, axis2, copy=True):
mapping = {i: j, j: i}
- new_axes = (self._get_axis(mapping.get(k, k))
- for k in range(self._AXIS_LEN))
+ new_axes = (self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN))
new_values = self.values.swapaxes(i, j)
if copy:
new_values = new_values.copy()
@@ -938,12 +998,14 @@ def squeeze(self, axis=None):
>>> df_0a.squeeze()
1
"""
- axis = (self._AXIS_NAMES if axis is None else
- (self._get_axis_number(axis),))
+ axis = self._AXIS_NAMES if axis is None else (self._get_axis_number(axis),)
try:
return self.iloc[
- tuple(0 if i in axis and len(a) == 1 else slice(None)
- for i, a in enumerate(self.axes))]
+ tuple(
+ 0 if i in axis and len(a) == 1 else slice(None)
+ for i, a in enumerate(self.axes)
+ )
+ ]
except Exception:
return self
@@ -1088,21 +1150,23 @@ def rename(self, *args, **kwargs):
See the :ref:`user guide ` for more.
"""
axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
- copy = kwargs.pop('copy', True)
- inplace = kwargs.pop('inplace', False)
- level = kwargs.pop('level', None)
- axis = kwargs.pop('axis', None)
- errors = kwargs.pop('errors', 'ignore')
+ copy = kwargs.pop("copy", True)
+ inplace = kwargs.pop("inplace", False)
+ level = kwargs.pop("level", None)
+ axis = kwargs.pop("axis", None)
+ errors = kwargs.pop("errors", "ignore")
if axis is not None:
# Validate the axis
self._get_axis_number(axis)
if kwargs:
- raise TypeError('rename() got an unexpected keyword '
- 'argument "{0}"'.format(list(kwargs.keys())[0]))
+ raise TypeError(
+ "rename() got an unexpected keyword "
+ 'argument "{0}"'.format(list(kwargs.keys())[0])
+ )
if com.count_not_none(*axes.values()) == 0:
- raise TypeError('must pass an index to rename')
+ raise TypeError("must pass an index to rename")
self._consolidate_inplace()
result = self if inplace else self.copy(deep=copy)
@@ -1120,14 +1184,15 @@ def rename(self, *args, **kwargs):
# GH 13473
if not callable(v):
indexer = self.axes[axis].get_indexer_for(v)
- if errors == 'raise' and len(indexer[indexer == -1]):
- missing_labels = [label for index, label in enumerate(v)
- if indexer[index] == -1]
- raise KeyError('{} not found in axis'
- .format(missing_labels))
-
- result._data = result._data.rename_axis(f, axis=baxis, copy=copy,
- level=level)
+ if errors == "raise" and len(indexer[indexer == -1]):
+ missing_labels = [
+ label for index, label in enumerate(v) if indexer[index] == -1
+ ]
+ raise KeyError("{} not found in axis".format(missing_labels))
+
+ result._data = result._data.rename_axis(
+ f, axis=baxis, copy=copy, level=level
+ )
result._clear_item_cache()
if inplace:
@@ -1135,8 +1200,7 @@ def rename(self, *args, **kwargs):
else:
return result.__finalize__(self)
- @rewrite_axis_style_signature('mapper', [('copy', True),
- ('inplace', False)])
+ @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)])
def rename_axis(self, mapper=sentinel, **kwargs):
"""
Set the name of the axis for the index or columns.
@@ -1262,28 +1326,31 @@ class name
monkey 2 2
"""
axes, kwargs = self._construct_axes_from_arguments(
- (), kwargs, sentinel=sentinel)
- copy = kwargs.pop('copy', True)
- inplace = kwargs.pop('inplace', False)
- axis = kwargs.pop('axis', 0)
+ (), kwargs, sentinel=sentinel
+ )
+ copy = kwargs.pop("copy", True)
+ inplace = kwargs.pop("inplace", False)
+ axis = kwargs.pop("axis", 0)
if axis is not None:
axis = self._get_axis_number(axis)
if kwargs:
- raise TypeError('rename_axis() got an unexpected keyword '
- 'argument "{0}"'.format(list(kwargs.keys())[0]))
+ raise TypeError(
+ "rename_axis() got an unexpected keyword "
+ 'argument "{0}"'.format(list(kwargs.keys())[0])
+ )
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if mapper is not sentinel:
# Use v0.23 behavior if a scalar or list
- non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not
- is_dict_like(mapper))
+ non_mapper = is_scalar(mapper) or (
+ is_list_like(mapper) and not is_dict_like(mapper)
+ )
if non_mapper:
return self._set_axis_name(mapper, axis=axis, inplace=inplace)
else:
- raise ValueError("Use `.rename` to alter labels "
- "with a mapper.")
+ raise ValueError("Use `.rename` to alter labels " "with a mapper.")
else:
# Use new behavior. Means that index and/or columns
# is specified
@@ -1293,16 +1360,14 @@ class name
v = axes.get(self._AXIS_NAMES[axis])
if v is sentinel:
continue
- non_mapper = is_scalar(v) or (is_list_like(v) and not
- is_dict_like(v))
+ non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
if non_mapper:
newnames = v
else:
f = com._get_rename_function(v)
curnames = self._get_axis(axis).names
newnames = [f(name) for name in curnames]
- result._set_axis_name(newnames, axis=axis,
- inplace=True)
+ result._set_axis_name(newnames, axis=axis, inplace=True)
if not inplace:
return result
@@ -1361,7 +1426,7 @@ def _set_axis_name(self, name, axis=0, inplace=False):
axis = self._get_axis_number(axis)
idx = self._get_axis(axis).set_names(name)
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
renamed = self if inplace else self.copy()
renamed.set_axis(idx, axis=axis, inplace=True)
if not inplace:
@@ -1371,8 +1436,9 @@ def _set_axis_name(self, name, axis=0, inplace=False):
# Comparison Methods
def _indexed_same(self, other):
- return all(self._get_axis(a).equals(other._get_axis(a))
- for a in self._AXIS_ORDERS)
+ return all(
+ self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
+ )
def equals(self, other):
"""
@@ -1467,24 +1533,32 @@ def __neg__(self):
values = com.values_from_object(self)
if is_bool_dtype(values):
arr = operator.inv(values)
- elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)
- or is_object_dtype(values)):
+ elif (
+ is_numeric_dtype(values)
+ or is_timedelta64_dtype(values)
+ or is_object_dtype(values)
+ ):
arr = operator.neg(values)
else:
- raise TypeError("Unary negative expects numeric dtype, not {}"
- .format(values.dtype))
+ raise TypeError(
+ "Unary negative expects numeric dtype, not {}".format(values.dtype)
+ )
return self.__array_wrap__(arr)
def __pos__(self):
values = com.values_from_object(self)
- if (is_bool_dtype(values) or is_period_arraylike(values)):
+ if is_bool_dtype(values) or is_period_arraylike(values):
arr = values
- elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)
- or is_object_dtype(values)):
+ elif (
+ is_numeric_dtype(values)
+ or is_timedelta64_dtype(values)
+ or is_object_dtype(values)
+ ):
arr = operator.pos(values)
else:
- raise TypeError("Unary plus expects numeric dtype, not {}"
- .format(values.dtype))
+ raise TypeError(
+ "Unary plus expects numeric dtype, not {}".format(values.dtype)
+ )
return self.__array_wrap__(arr)
def __invert__(self):
@@ -1500,9 +1574,12 @@ def __invert__(self):
raise
def __nonzero__(self):
- raise ValueError("The truth value of a {0} is ambiguous. "
- "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
- .format(self.__class__.__name__))
+ raise ValueError(
+ "The truth value of a {0} is ambiguous. "
+ "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(
+ self.__class__.__name__
+ )
+ )
__bool__ = __nonzero__
@@ -1523,8 +1600,10 @@ def bool(self):
if isinstance(v, (bool, np.bool_)):
return bool(v)
elif is_scalar(v):
- raise ValueError("bool cannot act on a non-boolean single element "
- "{0}".format(self.__class__.__name__))
+ raise ValueError(
+ "bool cannot act on a non-boolean single element "
+ "{0}".format(self.__class__.__name__)
+ )
self.__nonzero__()
@@ -1565,10 +1644,12 @@ def _is_level_reference(self, key, axis=0):
"""
axis = self._get_axis_number(axis)
- return (key is not None and
- is_hashable(key) and
- key in self.axes[axis].names and
- not self._is_label_reference(key, axis=axis))
+ return (
+ key is not None
+ and is_hashable(key)
+ and key in self.axes[axis].names
+ and not self._is_label_reference(key, axis=axis)
+ )
def _is_label_reference(self, key, axis=0):
"""
@@ -1593,9 +1674,11 @@ def _is_label_reference(self, key, axis=0):
axis = self._get_axis_number(axis)
other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)
- return (key is not None and
- is_hashable(key) and
- any(key in self.axes[ax] for ax in other_axes))
+ return (
+ key is not None
+ and is_hashable(key)
+ and any(key in self.axes[ax] for ax in other_axes)
+ )
def _is_label_or_level_reference(self, key, axis=0):
"""
@@ -1617,8 +1700,9 @@ def _is_label_or_level_reference(self, key, axis=0):
-------
is_label_or_level: bool
"""
- return (self._is_level_reference(key, axis=axis) or
- self._is_label_reference(key, axis=axis))
+ return self._is_level_reference(key, axis=axis) or self._is_label_reference(
+ key, axis=axis
+ )
def _check_label_or_level_ambiguity(self, key, axis=0):
"""
@@ -1641,27 +1725,32 @@ def _check_label_or_level_ambiguity(self, key, axis=0):
axis = self._get_axis_number(axis)
other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)
- if (key is not None and
- is_hashable(key) and
- key in self.axes[axis].names and
- any(key in self.axes[ax] for ax in other_axes)):
+ if (
+ key is not None
+ and is_hashable(key)
+ and key in self.axes[axis].names
+ and any(key in self.axes[ax] for ax in other_axes)
+ ):
# Build an informative and grammatical warning
- level_article, level_type = (('an', 'index')
- if axis == 0 else
- ('a', 'column'))
-
- label_article, label_type = (('a', 'column')
- if axis == 0 else
- ('an', 'index'))
-
- msg = ("'{key}' is both {level_article} {level_type} level and "
- "{label_article} {label_type} label, which is ambiguous."
- ).format(key=key,
- level_article=level_article,
- level_type=level_type,
- label_article=label_article,
- label_type=label_type)
+ level_article, level_type = (
+ ("an", "index") if axis == 0 else ("a", "column")
+ )
+
+ label_article, label_type = (
+ ("a", "column") if axis == 0 else ("an", "index")
+ )
+
+ msg = (
+ "'{key}' is both {level_article} {level_type} level and "
+ "{label_article} {label_type} label, which is ambiguous."
+ ).format(
+ key=key,
+ level_article=level_article,
+ level_type=level_type,
+ label_article=label_article,
+ label_type=label_type,
+ )
raise ValueError(msg)
def _get_label_or_level_values(self, key, axis=0):
@@ -1712,21 +1801,27 @@ def _get_label_or_level_values(self, key, axis=0):
# Check for duplicates
if values.ndim > 1:
- if other_axes and isinstance(
- self._get_axis(other_axes[0]), MultiIndex):
- multi_message = ('\n'
- 'For a multi-index, the label must be a '
- 'tuple with elements corresponding to '
- 'each level.')
+ if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
+ multi_message = (
+ "\n"
+ "For a multi-index, the label must be a "
+ "tuple with elements corresponding to "
+ "each level."
+ )
else:
- multi_message = ''
-
- label_axis_name = 'column' if axis == 0 else 'index'
- raise ValueError(("The {label_axis_name} label '{key}' "
- "is not unique.{multi_message}")
- .format(key=key,
- label_axis_name=label_axis_name,
- multi_message=multi_message))
+ multi_message = ""
+
+ label_axis_name = "column" if axis == 0 else "index"
+ raise ValueError(
+ (
+ "The {label_axis_name} label '{key}' "
+ "is not unique.{multi_message}"
+ ).format(
+ key=key,
+ label_axis_name=label_axis_name,
+ multi_message=multi_message,
+ )
+ )
return values
@@ -1760,21 +1855,22 @@ def _drop_labels_or_levels(self, keys, axis=0):
# Validate keys
keys = com.maybe_make_list(keys)
- invalid_keys = [k for k in keys if not
- self._is_label_or_level_reference(k, axis=axis)]
+ invalid_keys = [
+ k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
+ ]
if invalid_keys:
- raise ValueError(("The following keys are not valid labels or "
- "levels for axis {axis}: {invalid_keys}")
- .format(axis=axis,
- invalid_keys=invalid_keys))
+ raise ValueError(
+ (
+ "The following keys are not valid labels or "
+ "levels for axis {axis}: {invalid_keys}"
+ ).format(axis=axis, invalid_keys=invalid_keys)
+ )
# Compute levels and labels to drop
- levels_to_drop = [k for k in keys
- if self._is_level_reference(k, axis=axis)]
+ levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]
- labels_to_drop = [k for k in keys
- if not self._is_level_reference(k, axis=axis)]
+ labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]
# Perform copy upfront and then use inplace operations below.
# This ensures that we always perform exactly one copy.
@@ -1810,8 +1906,10 @@ def _drop_labels_or_levels(self, keys, axis=0):
# Iteration
def __hash__(self):
- raise TypeError('{0!r} objects are mutable, thus they cannot be'
- ' hashed'.format(self.__class__.__name__))
+ raise TypeError(
+ "{0!r} objects are mutable, thus they cannot be"
+ " hashed".format(self.__class__.__name__)
+ )
def __iter__(self):
"""
@@ -1937,9 +2035,12 @@ def to_dense(self):
%(klass)s
Dense %(klass)s.
"""
- warnings.warn("DataFrame/Series.to_dense is deprecated "
- "and will be removed in a future version",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "DataFrame/Series.to_dense is deprecated "
+ "and will be removed in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
# compat
return self
@@ -1948,15 +2049,14 @@ def to_dense(self):
def __getstate__(self):
meta = {k: getattr(self, k, None) for k in self._metadata}
- return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata,
- **meta)
+ return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, **meta)
def __setstate__(self, state):
if isinstance(state, BlockManager):
self._data = state
elif isinstance(state, dict):
- typ = state.get('_typ')
+ typ = state.get("_typ")
if typ is not None:
# set in the order of internal names
@@ -1996,15 +2096,15 @@ def __setstate__(self, state):
def __repr__(self):
# string representation based upon iterating over self
# (since, by definition, `PandasContainers` are iterable)
- prepr = '[%s]' % ','.join(map(pprint_thing, self))
- return '%s(%s)' % (self.__class__.__name__, prepr)
+ prepr = "[%s]" % ",".join(map(pprint_thing, self))
+ return "%s(%s)" % (self.__class__.__name__, prepr)
def _repr_latex_(self):
"""
Returns a LaTeX representation for a particular object.
Mainly for use with nbconvert (jupyter notebook conversion to pdf).
"""
- if config.get_option('display.latex.repr'):
+ if config.get_option("display.latex.repr"):
return self.to_latex()
else:
return None
@@ -2015,15 +2115,18 @@ def _repr_data_resource_(self):
naming convention.
"""
if config.get_option("display.html.table_schema"):
- data = self.head(config.get_option('display.max_rows'))
- payload = json.loads(data.to_json(orient='table'),
- object_pairs_hook=collections.OrderedDict)
+ data = self.head(config.get_option("display.max_rows"))
+ payload = json.loads(
+ data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict
+ )
return payload
# ----------------------------------------------------------------------
# I/O Methods
- _shared_docs['to_excel'] = """
+ _shared_docs[
+ "to_excel"
+ ] = """
Write %(klass)s to an Excel sheet.
To write a single %(klass)s to an Excel .xlsx file it is only necessary to
@@ -2128,28 +2231,62 @@ def _repr_data_resource_(self):
"""
@Appender(_shared_docs["to_excel"] % dict(klass="object"))
- def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="",
- float_format=None, columns=None, header=True, index=True,
- index_label=None, startrow=0, startcol=0, engine=None,
- merge_cells=True, encoding=None, inf_rep="inf", verbose=True,
- freeze_panes=None):
+ def to_excel(
+ self,
+ excel_writer,
+ sheet_name="Sheet1",
+ na_rep="",
+ float_format=None,
+ columns=None,
+ header=True,
+ index=True,
+ index_label=None,
+ startrow=0,
+ startcol=0,
+ engine=None,
+ merge_cells=True,
+ encoding=None,
+ inf_rep="inf",
+ verbose=True,
+ freeze_panes=None,
+ ):
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
from pandas.io.formats.excel import ExcelFormatter
- formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns,
- header=header,
- float_format=float_format, index=index,
- index_label=index_label,
- merge_cells=merge_cells,
- inf_rep=inf_rep)
- formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
- startcol=startcol, freeze_panes=freeze_panes,
- engine=engine)
-
- def to_json(self, path_or_buf=None, orient=None, date_format=None,
- double_precision=10, force_ascii=True, date_unit='ms',
- default_handler=None, lines=False, compression='infer',
- index=True):
+
+ formatter = ExcelFormatter(
+ df,
+ na_rep=na_rep,
+ cols=columns,
+ header=header,
+ float_format=float_format,
+ index=index,
+ index_label=index_label,
+ merge_cells=merge_cells,
+ inf_rep=inf_rep,
+ )
+ formatter.write(
+ excel_writer,
+ sheet_name=sheet_name,
+ startrow=startrow,
+ startcol=startcol,
+ freeze_panes=freeze_panes,
+ engine=engine,
+ )
+
+ def to_json(
+ self,
+ path_or_buf=None,
+ orient=None,
+ date_format=None,
+ double_precision=10,
+ force_ascii=True,
+ date_unit="ms",
+ default_handler=None,
+ lines=False,
+ compression="infer",
+ index=True,
+ ):
"""
Convert the object to a JSON string.
@@ -2286,17 +2423,24 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,
"""
from pandas.io import json
- if date_format is None and orient == 'table':
- date_format = 'iso'
+
+ if date_format is None and orient == "table":
+ date_format = "iso"
elif date_format is None:
- date_format = 'epoch'
- return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient,
- date_format=date_format,
- double_precision=double_precision,
- force_ascii=force_ascii, date_unit=date_unit,
- default_handler=default_handler,
- lines=lines, compression=compression,
- index=index)
+ date_format = "epoch"
+ return json.to_json(
+ path_or_buf=path_or_buf,
+ obj=self,
+ orient=orient,
+ date_format=date_format,
+ double_precision=double_precision,
+ force_ascii=force_ascii,
+ date_unit=date_unit,
+ default_handler=default_handler,
+ lines=lines,
+ compression=compression,
+ index=index,
+ )
def to_hdf(self, path_or_buf, key, **kwargs):
"""
@@ -2400,9 +2544,10 @@ def to_hdf(self, path_or_buf, key, **kwargs):
>>> os.remove('data.h5')
"""
from pandas.io import pytables
+
pytables.to_hdf(path_or_buf, key, self, **kwargs)
- def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
+ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs):
"""
Serialize object to input file path using msgpack format.
@@ -2429,11 +2574,21 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
"""
from pandas.io import packers
- return packers.to_msgpack(path_or_buf, self, encoding=encoding,
- **kwargs)
- def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
- index_label=None, chunksize=None, dtype=None, method=None):
+ return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs)
+
+ def to_sql(
+ self,
+ name,
+ con,
+ schema=None,
+ if_exists="fail",
+ index=True,
+ index_label=None,
+ chunksize=None,
+ dtype=None,
+ method=None,
+ ):
"""
Write records stored in a DataFrame to a SQL database.
@@ -2561,12 +2716,21 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
[(1,), (None,), (2,)]
"""
from pandas.io import sql
- sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,
- index=index, index_label=index_label, chunksize=chunksize,
- dtype=dtype, method=method)
- def to_pickle(self, path, compression='infer',
- protocol=pickle.HIGHEST_PROTOCOL):
+ sql.to_sql(
+ self,
+ name,
+ con,
+ schema=schema,
+ if_exists=if_exists,
+ index=index,
+ index_label=index_label,
+ chunksize=chunksize,
+ dtype=dtype,
+ method=method,
+ )
+
+ def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to file.
@@ -2621,6 +2785,7 @@ def to_pickle(self, path, compression='infer',
>>> os.remove("./dummy.pkl")
"""
from pandas.io.pickle import to_pickle
+
to_pickle(self, path, compression=compression, protocol=protocol)
def to_clipboard(self, excel=True, sep=None, **kwargs):
@@ -2678,6 +2843,7 @@ def to_clipboard(self, excel=True, sep=None, **kwargs):
... # 4,5,6
"""
from pandas.io import clipboards
+
clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
def to_xarray(self):
@@ -2762,12 +2928,28 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal'
else:
return xarray.Dataset.from_dataframe(self)
- def to_latex(self, buf=None, columns=None, col_space=None, header=True,
- index=True, na_rep='NaN', formatters=None, float_format=None,
- sparsify=None, index_names=True, bold_rows=False,
- column_format=None, longtable=None, escape=None,
- encoding=None, decimal='.', multicolumn=None,
- multicolumn_format=None, multirow=None):
+ def to_latex(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ bold_rows=False,
+ column_format=None,
+ longtable=None,
+ escape=None,
+ encoding=None,
+ decimal=".",
+ multicolumn=None,
+ multicolumn_format=None,
+ multirow=None,
+ ):
r"""
Render an object to a LaTeX tabular environment table.
@@ -2879,34 +3061,60 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True,
if multicolumn is None:
multicolumn = config.get_option("display.latex.multicolumn")
if multicolumn_format is None:
- multicolumn_format = config.get_option(
- "display.latex.multicolumn_format")
+ multicolumn_format = config.get_option("display.latex.multicolumn_format")
if multirow is None:
multirow = config.get_option("display.latex.multirow")
- formatter = DataFrameFormatter(self, buf=buf, columns=columns,
- col_space=col_space, na_rep=na_rep,
- header=header, index=index,
- formatters=formatters,
- float_format=float_format,
- bold_rows=bold_rows,
- sparsify=sparsify,
- index_names=index_names,
- escape=escape, decimal=decimal)
- formatter.to_latex(column_format=column_format, longtable=longtable,
- encoding=encoding, multicolumn=multicolumn,
- multicolumn_format=multicolumn_format,
- multirow=multirow)
+ formatter = DataFrameFormatter(
+ self,
+ buf=buf,
+ columns=columns,
+ col_space=col_space,
+ na_rep=na_rep,
+ header=header,
+ index=index,
+ formatters=formatters,
+ float_format=float_format,
+ bold_rows=bold_rows,
+ sparsify=sparsify,
+ index_names=index_names,
+ escape=escape,
+ decimal=decimal,
+ )
+ formatter.to_latex(
+ column_format=column_format,
+ longtable=longtable,
+ encoding=encoding,
+ multicolumn=multicolumn,
+ multicolumn_format=multicolumn_format,
+ multirow=multirow,
+ )
if buf is None:
return formatter.buf.getvalue()
- def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
- columns=None, header=True, index=True, index_label=None,
- mode='w', encoding=None, compression='infer', quoting=None,
- quotechar='"', line_terminator=None, chunksize=None,
- date_format=None, doublequote=True,
- escapechar=None, decimal='.'):
+ def to_csv(
+ self,
+ path_or_buf=None,
+ sep=",",
+ na_rep="",
+ float_format=None,
+ columns=None,
+ header=True,
+ index=True,
+ index_label=None,
+ mode="w",
+ encoding=None,
+ compression="infer",
+ quoting=None,
+ quotechar='"',
+ line_terminator=None,
+ chunksize=None,
+ date_format=None,
+ doublequote=True,
+ escapechar=None,
+ decimal=".",
+ ):
r"""
Write object to a comma-separated values (csv) file.
@@ -3012,17 +3220,29 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
from pandas.io.formats.csvs import CSVFormatter
- formatter = CSVFormatter(df, path_or_buf,
- line_terminator=line_terminator, sep=sep,
- encoding=encoding,
- compression=compression, quoting=quoting,
- na_rep=na_rep, float_format=float_format,
- cols=columns, header=header, index=index,
- index_label=index_label, mode=mode,
- chunksize=chunksize, quotechar=quotechar,
- date_format=date_format,
- doublequote=doublequote,
- escapechar=escapechar, decimal=decimal)
+
+ formatter = CSVFormatter(
+ df,
+ path_or_buf,
+ line_terminator=line_terminator,
+ sep=sep,
+ encoding=encoding,
+ compression=compression,
+ quoting=quoting,
+ na_rep=na_rep,
+ float_format=float_format,
+ cols=columns,
+ header=header,
+ index=index,
+ index_label=index_label,
+ mode=mode,
+ chunksize=chunksize,
+ quotechar=quotechar,
+ date_format=date_format,
+ doublequote=doublequote,
+ escapechar=escapechar,
+ decimal=decimal,
+ )
formatter.save()
if path_or_buf is None:
@@ -3082,7 +3302,7 @@ def _set_as_cached(self, item, cacher):
def _reset_cacher(self):
"""Reset the cacher."""
- if hasattr(self, '_cacher'):
+ if hasattr(self, "_cacher"):
del self._cacher
def _iget_item_cache(self, item):
@@ -3105,11 +3325,11 @@ def _maybe_cache_changed(self, item, value):
@property
def _is_cached(self):
"""Return boolean indicating if self is cached or not."""
- return getattr(self, '_cacher', None) is not None
+ return getattr(self, "_cacher", None) is not None
def _get_cacher(self):
"""return my cacher or None"""
- cacher = getattr(self, '_cacher', None)
+ cacher = getattr(self, "_cacher", None)
if cacher is not None:
cacher = cacher[1]()
return cacher
@@ -3133,7 +3353,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True):
"""
- cacher = getattr(self, '_cacher', None)
+ cacher = getattr(self, "_cacher", None)
if cacher is not None:
ref = cacher[1]()
@@ -3148,7 +3368,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True):
pass
if verify_is_copy:
- self._check_setitem_copy(stacklevel=5, t='referant')
+ self._check_setitem_copy(stacklevel=5, t="referant")
if clear:
self._clear_item_cache()
@@ -3202,14 +3422,13 @@ def _check_is_chained_assignment_possible(self):
if self._is_view and self._is_cached:
ref = self._get_cacher()
if ref is not None and ref._is_mixed_type:
- self._check_setitem_copy(stacklevel=4, t='referant',
- force=True)
+ self._check_setitem_copy(stacklevel=4, t="referant", force=True)
return True
elif self._is_copy:
- self._check_setitem_copy(stacklevel=4, t='referant')
+ self._check_setitem_copy(stacklevel=4, t="referant")
return False
- def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
+ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False):
"""
Parameters
@@ -3244,7 +3463,7 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
if not (force or self._is_copy):
return
- value = config.get_option('mode.chained_assignment')
+ value = config.get_option("mode.chained_assignment")
if value is None:
return
@@ -3260,30 +3479,31 @@ def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
if isinstance(self._is_copy, str):
t = self._is_copy
- elif t == 'referant':
- t = ("\n"
- "A value is trying to be set on a copy of a slice from a "
- "DataFrame\n\n"
- "See the caveats in the documentation: "
- "http://pandas.pydata.org/pandas-docs/stable/user_guide/"
- "indexing.html#returning-a-view-versus-a-copy"
- )
+ elif t == "referant":
+ t = (
+ "\n"
+ "A value is trying to be set on a copy of a slice from a "
+ "DataFrame\n\n"
+ "See the caveats in the documentation: "
+ "http://pandas.pydata.org/pandas-docs/stable/user_guide/"
+ "indexing.html#returning-a-view-versus-a-copy"
+ )
else:
- t = ("\n"
- "A value is trying to be set on a copy of a slice from a "
- "DataFrame.\n"
- "Try using .loc[row_indexer,col_indexer] = value "
- "instead\n\nSee the caveats in the documentation: "
- "http://pandas.pydata.org/pandas-docs/stable/user_guide/"
- "indexing.html#returning-a-view-versus-a-copy"
- )
-
- if value == 'raise':
+ t = (
+ "\n"
+ "A value is trying to be set on a copy of a slice from a "
+ "DataFrame.\n"
+ "Try using .loc[row_indexer,col_indexer] = value "
+ "instead\n\nSee the caveats in the documentation: "
+ "http://pandas.pydata.org/pandas-docs/stable/user_guide/"
+ "indexing.html#returning-a-view-versus-a-copy"
+ )
+
+ if value == "raise":
raise com.SettingWithCopyError(t)
- elif value == 'warn':
- warnings.warn(t, com.SettingWithCopyWarning,
- stacklevel=stacklevel)
+ elif value == "warn":
+ warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel)
def __delitem__(self, key):
"""
@@ -3292,7 +3512,7 @@ def __delitem__(self, key):
deleted = False
maybe_shortcut = False
- if hasattr(self, 'columns') and isinstance(self.columns, MultiIndex):
+ if hasattr(self, "columns") and isinstance(self.columns, MultiIndex):
try:
maybe_shortcut = key not in self.columns._engine
except TypeError:
@@ -3302,9 +3522,9 @@ def __delitem__(self, key):
# Allow shorthand to delete all columns whose first len(key)
# elements match key:
if not isinstance(key, tuple):
- key = (key, )
+ key = (key,)
for col in self.columns:
- if isinstance(col, tuple) and col[:len(key)] == key:
+ if isinstance(col, tuple) and col[: len(key)] == key:
del self[col]
deleted = True
if not deleted:
@@ -3353,9 +3573,9 @@ def _take(self, indices, axis=0, is_copy=True):
"""
self._consolidate_inplace()
- new_data = self._data.take(indices,
- axis=self._get_block_manager_axis(axis),
- verify=True)
+ new_data = self._data.take(
+ indices, axis=self._get_block_manager_axis(axis), verify=True
+ )
result = self._constructor(new_data).__finalize__(self)
# Maybe set copy if we didn't actually change the index.
@@ -3545,8 +3765,7 @@ class animal locomotion
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if level is not None:
- loc, new_ax = labels.get_loc_level(key, level=level,
- drop_level=drop_level)
+ loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
indexer = [slice(None)] * self.ndim
@@ -3564,8 +3783,7 @@ class animal locomotion
index = self.index
if isinstance(index, MultiIndex):
- loc, new_index = self.index.get_loc_level(key,
- drop_level=drop_level)
+ loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
else:
loc = self.index.get_loc(key)
@@ -3591,8 +3809,11 @@ class animal locomotion
return com.maybe_box_datetimelike(new_values)
result = self._constructor_sliced(
- new_values, index=self.columns,
- name=self.index[loc], dtype=new_values.dtype)
+ new_values,
+ index=self.columns,
+ name=self.index[loc],
+ dtype=new_values.dtype,
+ )
else:
result = self.iloc[loc]
@@ -3605,8 +3826,7 @@ class animal locomotion
_xs = xs # type: Callable
- def reindex_like(self, other, method=None, copy=True, limit=None,
- tolerance=None):
+ def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None):
"""
Return an object with matching indices as other object.
@@ -3701,28 +3921,42 @@ def reindex_like(self, other, method=None, copy=True, limit=None,
2014-02-14 NaN NaN NaN
2014-02-15 35.1 NaN medium
"""
- d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method,
- copy=copy, limit=limit,
- tolerance=tolerance)
+ d = other._construct_axes_dict(
+ axes=self._AXIS_ORDERS,
+ method=method,
+ copy=copy,
+ limit=limit,
+ tolerance=tolerance,
+ )
return self.reindex(**d)
- def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
- inplace=False, errors='raise'):
+ def drop(
+ self,
+ labels=None,
+ axis=0,
+ index=None,
+ columns=None,
+ level=None,
+ inplace=False,
+ errors="raise",
+ ):
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if labels is not None:
if index is not None or columns is not None:
- raise ValueError("Cannot specify both 'labels' and "
- "'index'/'columns'")
+ raise ValueError(
+ "Cannot specify both 'labels' and " "'index'/'columns'"
+ )
axis_name = self._get_axis_name(axis)
axes = {axis_name: labels}
elif index is not None or columns is not None:
axes, _ = self._construct_axes_from_arguments((index, columns), {})
else:
- raise ValueError("Need to specify at least one of 'labels', "
- "'index' or 'columns'")
+ raise ValueError(
+ "Need to specify at least one of 'labels', " "'index' or 'columns'"
+ )
obj = self
@@ -3735,7 +3969,7 @@ def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
else:
return obj
- def _drop_axis(self, labels, axis, level=None, errors='raise'):
+ def _drop_axis(self, labels, axis, level=None, errors="raise"):
"""
Drop labels from specified axis. Used in the ``drop`` method
internally.
@@ -3757,7 +3991,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'):
if axis.is_unique:
if level is not None:
if not isinstance(axis, MultiIndex):
- raise AssertionError('axis must be a MultiIndex')
+ raise AssertionError("axis must be a MultiIndex")
new_axis = axis.drop(labels, level=level, errors=errors)
else:
new_axis = axis.drop(labels, errors=errors)
@@ -3768,18 +4002,18 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'):
labels = ensure_object(com.index_labels_to_array(labels))
if level is not None:
if not isinstance(axis, MultiIndex):
- raise AssertionError('axis must be a MultiIndex')
+ raise AssertionError("axis must be a MultiIndex")
indexer = ~axis.get_level_values(level).isin(labels)
# GH 18561 MultiIndex.drop should raise if label is absent
- if errors == 'raise' and indexer.all():
- raise KeyError('{} not found in axis'.format(labels))
+ if errors == "raise" and indexer.all():
+ raise KeyError("{} not found in axis".format(labels))
else:
indexer = ~axis.isin(labels)
# Check if label doesn't exist along axis
labels_missing = (axis.get_indexer_for(labels) == -1).any()
- if errors == 'raise' and labels_missing:
- raise KeyError('{} not found in axis'.format(labels))
+ if errors == "raise" and labels_missing:
+ raise KeyError("{} not found in axis".format(labels))
slicer = [slice(None)] * self.ndim
slicer[self._get_axis_number(axis_name)] = indexer
@@ -3803,7 +4037,7 @@ def _update_inplace(self, result, verify_is_copy=True):
self._reset_cache()
self._clear_item_cache()
- self._data = getattr(result, '_data', result)
+ self._data = getattr(result, "_data", result)
self._maybe_update_cacher(verify_is_copy=verify_is_copy)
def add_prefix(self, prefix):
@@ -3860,7 +4094,7 @@ def add_prefix(self, prefix):
2 3 5
3 4 6
"""
- f = functools.partial('{prefix}{}'.format, prefix=prefix)
+ f = functools.partial("{prefix}{}".format, prefix=prefix)
mapper = {self._info_axis_name: f}
return self.rename(**mapper)
@@ -3919,13 +4153,20 @@ def add_suffix(self, suffix):
2 3 5
3 4 6
"""
- f = functools.partial('{}{suffix}'.format, suffix=suffix)
+ f = functools.partial("{}{suffix}".format, suffix=suffix)
mapper = {self._info_axis_name: f}
return self.rename(**mapper)
- def sort_values(self, by=None, axis=0, ascending=True, inplace=False,
- kind='quicksort', na_position='last'):
+ def sort_values(
+ self,
+ by=None,
+ axis=0,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ ):
"""
Sort by the values along either axis.
@@ -4015,8 +4256,16 @@ def sort_values(self, by=None, axis=0, ascending=True, inplace=False,
"""
raise AbstractMethodError(self)
- def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
- kind='quicksort', na_position='last', sort_remaining=True):
+ def sort_index(
+ self,
+ axis=0,
+ level=None,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ sort_remaining=True,
+ ):
"""
Sort object by labels (along an axis).
@@ -4048,7 +4297,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
sorted_obj : DataFrame or None
DataFrame with sorted index if inplace=False, None otherwise.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
axis = self._get_axis_number(axis)
axis_name = self._get_axis_name(axis)
labels = self._get_axis(axis)
@@ -4273,12 +4522,12 @@ def reindex(self, *args, **kwargs):
# construct the args
axes, kwargs = self._construct_axes_from_arguments(args, kwargs)
- method = missing.clean_reindex_fill_method(kwargs.pop('method', None))
- level = kwargs.pop('level', None)
- copy = kwargs.pop('copy', True)
- limit = kwargs.pop('limit', None)
- tolerance = kwargs.pop('tolerance', None)
- fill_value = kwargs.pop('fill_value', None)
+ method = missing.clean_reindex_fill_method(kwargs.pop("method", None))
+ level = kwargs.pop("level", None)
+ copy = kwargs.pop("copy", True)
+ limit = kwargs.pop("limit", None)
+ tolerance = kwargs.pop("tolerance", None)
+ fill_value = kwargs.pop("fill_value", None)
# Series.reindex doesn't use / need the axis kwarg
# We pop and ignore it here, to make writing Series/Frame generic code
@@ -4286,15 +4535,20 @@ def reindex(self, *args, **kwargs):
kwargs.pop("axis", None)
if kwargs:
- raise TypeError('reindex() got an unexpected keyword '
- 'argument "{0}"'.format(list(kwargs.keys())[0]))
+ raise TypeError(
+ "reindex() got an unexpected keyword "
+ 'argument "{0}"'.format(list(kwargs.keys())[0])
+ )
self._consolidate_inplace()
# if all axes that are requested to reindex are equal, then only copy
# if indicated must have index names equal here as well as values
- if all(self._get_axis(axis).identical(ax)
- for axis, ax in axes.items() if ax is not None):
+ if all(
+ self._get_axis(axis).identical(ax)
+ for axis, ax in axes.items()
+ if ax is not None
+ ):
if copy:
return self.copy()
return self
@@ -4307,11 +4561,11 @@ def reindex(self, *args, **kwargs):
pass
# perform the reindex on the axes
- return self._reindex_axes(axes, level, limit, tolerance, method,
- fill_value, copy).__finalize__(self)
+ return self._reindex_axes(
+ axes, level, limit, tolerance, method, fill_value, copy
+ ).__finalize__(self)
- def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
- copy):
+ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
"""Perform the reindex for all the axes."""
obj = self
for a in self._AXIS_ORDERS:
@@ -4320,26 +4574,35 @@ def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
continue
ax = self._get_axis(a)
- new_index, indexer = ax.reindex(labels, level=level, limit=limit,
- tolerance=tolerance, method=method)
+ new_index, indexer = ax.reindex(
+ labels, level=level, limit=limit, tolerance=tolerance, method=method
+ )
axis = self._get_axis_number(a)
- obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
- fill_value=fill_value,
- copy=copy, allow_dups=False)
+ obj = obj._reindex_with_indexers(
+ {axis: [new_index, indexer]},
+ fill_value=fill_value,
+ copy=copy,
+ allow_dups=False,
+ )
return obj
def _needs_reindex_multi(self, axes, method, level):
"""Check if we do need a multi reindex."""
- return ((com.count_not_none(*axes.values()) == self._AXIS_LEN) and
- method is None and level is None and not self._is_mixed_type)
+ return (
+ (com.count_not_none(*axes.values()) == self._AXIS_LEN)
+ and method is None
+ and level is None
+ and not self._is_mixed_type
+ )
def _reindex_multi(self, axes, copy, fill_value):
return NotImplemented
- def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,
- allow_dups=False):
+ def _reindex_with_indexers(
+ self, reindexers, fill_value=None, copy=False, allow_dups=False
+ ):
"""allow_dups indicates an internal call here """
# reindex doing multiple operations on different axes if indicated
@@ -4356,10 +4619,14 @@ def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False,
indexer = ensure_int64(indexer)
# TODO: speed up on homogeneous DataFrame objects
- new_data = new_data.reindex_indexer(index, indexer, axis=baxis,
- fill_value=fill_value,
- allow_dups=allow_dups,
- copy=copy)
+ new_data = new_data.reindex_indexer(
+ index,
+ indexer,
+ axis=baxis,
+ fill_value=fill_value,
+ allow_dups=allow_dups,
+ copy=copy,
+ )
if copy and new_data is self._data:
new_data = new_data.copy()
@@ -4429,8 +4696,10 @@ def filter(self, items=None, like=None, regex=None, axis=None):
nkw = com.count_not_none(items, like, regex)
if nkw > 1:
- raise TypeError('Keyword arguments `items`, `like`, or `regex` '
- 'are mutually exclusive')
+ raise TypeError(
+ "Keyword arguments `items`, `like`, or `regex` "
+ "are mutually exclusive"
+ )
if axis is None:
axis = self._info_axis_name
@@ -4438,21 +4707,24 @@ def filter(self, items=None, like=None, regex=None, axis=None):
if items is not None:
name = self._get_axis_name(axis)
- return self.reindex(
- **{name: [r for r in items if r in labels]})
+ return self.reindex(**{name: [r for r in items if r in labels]})
elif like:
+
def f(x):
return like in ensure_str(x)
+
values = labels.map(f)
return self.loc(axis=axis)[values]
elif regex:
+
def f(x):
return matcher.search(ensure_str(x)) is not None
+
matcher = re.compile(regex)
values = labels.map(f)
return self.loc(axis=axis)[values]
else:
- raise TypeError('Must pass either `items`, `like`, or `regex`')
+ raise TypeError("Must pass either `items`, `like`, or `regex`")
def head(self, n=5):
"""
@@ -4574,8 +4846,15 @@ def tail(self, n=5):
return self.iloc[0:0]
return self.iloc[-n:]
- def sample(self, n=None, frac=None, replace=False, weights=None,
- random_state=None, axis=None):
+ def sample(
+ self,
+ n=None,
+ frac=None,
+ replace=False,
+ weights=None,
+ random_state=None,
+ axis=None,
+ ):
"""
Return a random sample of items from an axis of object.
@@ -4683,28 +4962,33 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
try:
weights = self[weights]
except KeyError:
- raise KeyError("String passed to weights not a "
- "valid column")
+ raise KeyError(
+ "String passed to weights not a " "valid column"
+ )
else:
- raise ValueError("Strings can only be passed to "
- "weights when sampling from rows on "
- "a DataFrame")
+ raise ValueError(
+ "Strings can only be passed to "
+ "weights when sampling from rows on "
+ "a DataFrame"
+ )
else:
- raise ValueError("Strings cannot be passed as weights "
- "when sampling from a Series.")
+ raise ValueError(
+ "Strings cannot be passed as weights "
+ "when sampling from a Series."
+ )
- weights = pd.Series(weights, dtype='float64')
+ weights = pd.Series(weights, dtype="float64")
if len(weights) != axis_length:
- raise ValueError("Weights and axis to be sampled must be of "
- "same length")
+ raise ValueError(
+ "Weights and axis to be sampled must be of " "same length"
+ )
if (weights == np.inf).any() or (weights == -np.inf).any():
raise ValueError("weight vector may not include `inf` values")
if (weights < 0).any():
- raise ValueError("weight vector many not include negative "
- "values")
+ raise ValueError("weight vector many not include negative " "values")
# If has nan, set to zero.
weights = weights.fillna(0)
@@ -4726,18 +5010,20 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
elif n is None and frac is not None:
n = int(round(frac * axis_length))
elif n is not None and frac is not None:
- raise ValueError('Please enter a value for `frac` OR `n`, not '
- 'both')
+ raise ValueError("Please enter a value for `frac` OR `n`, not " "both")
# Check for negative sizes
if n < 0:
- raise ValueError("A negative number of rows requested. Please "
- "provide positive value.")
+ raise ValueError(
+ "A negative number of rows requested. Please " "provide positive value."
+ )
locs = rs.choice(axis_length, size=n, replace=replace, p=weights)
return self.take(locs, axis=axis, is_copy=False)
- _shared_docs['pipe'] = (r"""
+ _shared_docs[
+ "pipe"
+ ] = r"""
Apply func(self, \*args, \*\*kwargs).
Parameters
@@ -4786,13 +5072,14 @@ def sample(self, n=None, frac=None, replace=False, weights=None,
... .pipe(g, arg1=a)
... .pipe((f, 'arg2'), arg1=a, arg3=c)
... )
- """)
+ """
- @Appender(_shared_docs['pipe'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["pipe"] % _shared_doc_kwargs)
def pipe(self, func, *args, **kwargs):
return com._pipe(self, func, *args, **kwargs)
- _shared_docs['aggregate'] = dedent("""
+ _shared_docs["aggregate"] = dedent(
+ """
Aggregate using one or more operations over the specified axis.
%(versionadded)s
Parameters
@@ -4830,9 +5117,12 @@ def pipe(self, func, *args, **kwargs):
`agg` is an alias for `aggregate`. Use the alias.
A passed user-defined-function will be passed a Series for evaluation.
- %(examples)s""")
+ %(examples)s"""
+ )
- _shared_docs['transform'] = ("""
+ _shared_docs[
+ "transform"
+ ] = """
Call ``func`` on self producing a %(klass)s with transformed values
and that has the same axis length as self.
@@ -4898,7 +5188,7 @@ def pipe(self, func, *args, **kwargs):
0 0.000000 1.000000
1 1.000000 2.718282
2 1.414214 7.389056
- """)
+ """
# ----------------------------------------------------------------------
# Attribute access
@@ -4928,8 +5218,11 @@ def __getattr__(self, name):
# Note: obj.x will always call obj.__getattribute__('x') prior to
# calling obj.__getattr__('x').
- if (name in self._internal_names_set or name in self._metadata or
- name in self._accessors):
+ if (
+ name in self._internal_names_set
+ or name in self._metadata
+ or name in self._accessors
+ ):
return object.__getattribute__(self, name)
else:
if self._info_axis._can_hold_identifiers_and_holds_name(name):
@@ -4968,19 +5261,24 @@ def __setattr__(self, name, value):
object.__setattr__(self, name, value)
except (AttributeError, TypeError):
if isinstance(self, ABCDataFrame) and (is_list_like(value)):
- warnings.warn("Pandas doesn't allow columns to be "
- "created via a new attribute name - see "
- "https://pandas.pydata.org/pandas-docs/"
- "stable/indexing.html#attribute-access",
- stacklevel=2)
+ warnings.warn(
+ "Pandas doesn't allow columns to be "
+ "created via a new attribute name - see "
+ "https://pandas.pydata.org/pandas-docs/"
+ "stable/indexing.html#attribute-access",
+ stacklevel=2,
+ )
object.__setattr__(self, name, value)
def _dir_additions(self):
""" add the string-like attributes from the info_axis.
If info_axis is a MultiIndex, its first level values are used.
"""
- additions = {c for c in self._info_axis.unique(level=0)[:100]
- if isinstance(c, str) and c.isidentifier()}
+ additions = {
+ c
+ for c in self._info_axis.unique(level=0)[:100]
+ if isinstance(c, str) and c.isidentifier()
+ }
return super()._dir_additions().union(additions)
# ----------------------------------------------------------------------
@@ -5021,7 +5319,7 @@ def _consolidate(self, inplace=False):
-------
consolidated : same type as caller
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if inplace:
self._consolidate_inplace()
else:
@@ -5057,14 +5355,15 @@ def _check_inplace_setting(self, value):
except Exception:
pass
- raise TypeError('Cannot do inplace boolean setting on '
- 'mixed-types with a non np.nan value')
+ raise TypeError(
+ "Cannot do inplace boolean setting on "
+ "mixed-types with a non np.nan value"
+ )
return True
def _get_numeric_data(self):
- return self._constructor(
- self._data.get_numeric_data()).__finalize__(self)
+ return self._constructor(self._data.get_numeric_data()).__finalize__(self)
def _get_bool_data(self):
return self._constructor(self._data.get_bool_data()).__finalize__(self)
@@ -5111,11 +5410,14 @@ def as_matrix(self, columns=None):
This method is provided for backwards compatibility. Generally,
it is recommended to use '.values'.
"""
- warnings.warn("Method .as_matrix will be removed in a future version. "
- "Use .values instead.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Method .as_matrix will be removed in a future version. "
+ "Use .values instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
self._consolidate_inplace()
- return self._data.as_array(transpose=self._AXIS_REVERSED,
- items=columns)
+ return self._data.as_array(transpose=self._AXIS_REVERSED, items=columns)
@property
def values(self):
@@ -5253,7 +5555,9 @@ def get_values(self):
warnings.warn(
"The 'get_values' method is deprecated and will be removed in a "
"future version. Use '.values' or 'np.asarray(..)' instead.",
- FutureWarning, stacklevel=2)
+ FutureWarning,
+ stacklevel=2,
+ )
return self._internal_get_values()
def _internal_get_values(self):
@@ -5292,11 +5596,15 @@ def get_dtype_counts(self):
object 1
dtype: int64
"""
- warnings.warn("`get_dtype_counts` has been deprecated and will be "
- "removed in a future version. For DataFrames use "
- "`.dtypes.value_counts()", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "`get_dtype_counts` has been deprecated and will be "
+ "removed in a future version. For DataFrames use "
+ "`.dtypes.value_counts()",
+ FutureWarning,
+ stacklevel=2,
+ )
from pandas import Series
+
return Series(self._data.get_dtype_counts())
def get_ftype_counts(self):
@@ -5335,11 +5643,14 @@ def get_ftype_counts(self):
object:dense 1
dtype: int64
"""
- warnings.warn("get_ftype_counts is deprecated and will "
- "be removed in a future version",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "get_ftype_counts is deprecated and will " "be removed in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
from pandas import Series
+
return Series(self._data.get_ftype_counts())
@property
@@ -5375,8 +5686,8 @@ def dtypes(self):
dtype: object
"""
from pandas import Series
- return Series(self._data.get_dtypes(), index=self._info_axis,
- dtype=np.object_)
+
+ return Series(self._data.get_dtypes(), index=self._info_axis, dtype=np.object_)
@property
def ftypes(self):
@@ -5423,14 +5734,17 @@ def ftypes(self):
3 float64:sparse
dtype: object
"""
- warnings.warn("DataFrame.ftypes is deprecated and will "
- "be removed in a future version. "
- "Use DataFrame.dtypes instead.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "DataFrame.ftypes is deprecated and will "
+ "be removed in a future version. "
+ "Use DataFrame.dtypes instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
from pandas import Series
- return Series(self._data.get_ftypes(), index=self._info_axis,
- dtype=np.object_)
+
+ return Series(self._data.get_ftypes(), index=self._info_axis, dtype=np.object_)
def as_blocks(self, copy=True):
"""
@@ -5450,9 +5764,11 @@ def as_blocks(self, copy=True):
-------
values : a dict of dtype -> Constructor Types
"""
- warnings.warn("as_blocks is deprecated and will "
- "be removed in a future version",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "as_blocks is deprecated and will " "be removed in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._to_dict_of_blocks(copy=copy)
@property
@@ -5471,10 +5787,12 @@ def _to_dict_of_blocks(self, copy=True):
Internal ONLY
"""
- return {k: self._constructor(v).__finalize__(self)
- for k, v, in self._data.to_dict(copy=copy).items()}
+ return {
+ k: self._constructor(v).__finalize__(self)
+ for k, v, in self._data.to_dict(copy=copy).items()
+ }
- def astype(self, dtype, copy=True, errors='raise', **kwargs):
+ def astype(self, dtype, copy=True, errors="raise", **kwargs):
"""
Cast a pandas object to a specified dtype ``dtype``.
@@ -5579,33 +5897,43 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
if is_dict_like(dtype):
if self.ndim == 1: # i.e. Series
if len(dtype) > 1 or self.name not in dtype:
- raise KeyError('Only the Series name can be used for '
- 'the key in Series dtype mappings.')
+ raise KeyError(
+ "Only the Series name can be used for "
+ "the key in Series dtype mappings."
+ )
new_type = dtype[self.name]
return self.astype(new_type, copy, errors, **kwargs)
for col_name in dtype.keys():
if col_name not in self:
- raise KeyError('Only a column name can be used for the '
- 'key in a dtype mappings argument.')
+ raise KeyError(
+ "Only a column name can be used for the "
+ "key in a dtype mappings argument."
+ )
results = []
for col_name, col in self.iteritems():
if col_name in dtype:
- results.append(col.astype(dtype=dtype[col_name], copy=copy,
- errors=errors, **kwargs))
+ results.append(
+ col.astype(
+ dtype=dtype[col_name], copy=copy, errors=errors, **kwargs
+ )
+ )
else:
results.append(results.append(col.copy() if copy else col))
elif is_extension_array_dtype(dtype) and self.ndim > 1:
# GH 18099/22869: columnwise conversion to extension dtype
# GH 24704: use iloc to handle duplicate column names
- results = (self.iloc[:, i].astype(dtype, copy=copy)
- for i in range(len(self.columns)))
+ results = (
+ self.iloc[:, i].astype(dtype, copy=copy)
+ for i in range(len(self.columns))
+ )
else:
# else, only a single dtype is given
- new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
- **kwargs)
+ new_data = self._data.astype(
+ dtype=dtype, copy=copy, errors=errors, **kwargs
+ )
return self._constructor(new_data).__finalize__(self)
# GH 19920: retain column metadata after concat
@@ -5735,8 +6063,9 @@ def __deepcopy__(self, memo=None):
memo = {}
return self.copy(deep=True)
- def _convert(self, datetime=False, numeric=False, timedelta=False,
- coerce=False, copy=True):
+ def _convert(
+ self, datetime=False, numeric=False, timedelta=False, coerce=False, copy=True
+ ):
"""
Attempt to infer better dtype for object columns
@@ -5762,9 +6091,14 @@ def _convert(self, datetime=False, numeric=False, timedelta=False,
converted : same as input object
"""
return self._constructor(
- self._data.convert(datetime=datetime, numeric=numeric,
- timedelta=timedelta, coerce=coerce,
- copy=copy)).__finalize__(self)
+ self._data.convert(
+ datetime=datetime,
+ numeric=numeric,
+ timedelta=timedelta,
+ coerce=coerce,
+ copy=copy,
+ )
+ ).__finalize__(self)
def infer_objects(self):
"""
@@ -5809,15 +6143,23 @@ def infer_objects(self):
# python objects will still be converted to
# native numpy numeric types
return self._constructor(
- self._data.convert(datetime=True, numeric=False,
- timedelta=True, coerce=False,
- copy=True)).__finalize__(self)
+ self._data.convert(
+ datetime=True, numeric=False, timedelta=True, coerce=False, copy=True
+ )
+ ).__finalize__(self)
# ----------------------------------------------------------------------
# Filling NA's
- def fillna(self, value=None, method=None, axis=None, inplace=False,
- limit=None, downcast=None):
+ def fillna(
+ self,
+ value=None,
+ method=None,
+ axis=None,
+ inplace=False,
+ limit=None,
+ downcast=None,
+ ):
"""
Fill NA/NaN values using the specified method.
@@ -5914,7 +6256,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
2 NaN 1.0 NaN 5
3 NaN 3.0 NaN 4
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
value, method = validate_fillna_kwargs(value, method)
self._consolidate_inplace()
@@ -5926,6 +6268,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
axis = self._get_axis_number(axis)
from pandas import DataFrame
+
if value is None:
if self._is_mixed_type and axis == 1:
@@ -5938,10 +6281,14 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
return result
- new_data = self._data.interpolate(method=method, axis=axis,
- limit=limit, inplace=inplace,
- coerce=True,
- downcast=downcast)
+ new_data = self._data.interpolate(
+ method=method,
+ axis=axis,
+ limit=limit,
+ inplace=inplace,
+ coerce=True,
+ downcast=downcast,
+ )
else:
if len(self._get_axis(axis)) == 0:
return self
@@ -5949,23 +6296,28 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
if self.ndim == 1:
if isinstance(value, (dict, ABCSeries)):
from pandas import Series
+
value = Series(value)
elif not is_list_like(value):
pass
else:
- raise TypeError('"value" parameter must be a scalar, dict '
- 'or Series, but you passed a '
- '"{0}"'.format(type(value).__name__))
-
- new_data = self._data.fillna(value=value, limit=limit,
- inplace=inplace,
- downcast=downcast)
+ raise TypeError(
+ '"value" parameter must be a scalar, dict '
+ "or Series, but you passed a "
+ '"{0}"'.format(type(value).__name__)
+ )
+
+ new_data = self._data.fillna(
+ value=value, limit=limit, inplace=inplace, downcast=downcast
+ )
elif isinstance(value, (dict, ABCSeries)):
if axis == 1:
- raise NotImplementedError('Currently only can fill '
- 'with dict/Series column '
- 'by column')
+ raise NotImplementedError(
+ "Currently only can fill "
+ "with dict/Series column "
+ "by column"
+ )
result = self if inplace else self.copy()
for k, v in value.items():
@@ -5976,9 +6328,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
return result if not inplace else None
elif not is_list_like(value):
- new_data = self._data.fillna(value=value, limit=limit,
- inplace=inplace,
- downcast=downcast)
+ new_data = self._data.fillna(
+ value=value, limit=limit, inplace=inplace, downcast=downcast
+ )
elif isinstance(value, DataFrame) and self.ndim == 2:
new_data = self.where(self.notna(), value)
else:
@@ -5998,8 +6350,9 @@ def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
%(klass)s
Object with missing values filled.
"""
- return self.fillna(method='ffill', axis=axis, inplace=inplace,
- limit=limit, downcast=downcast)
+ return self.fillna(
+ method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
+ )
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
"""
@@ -6010,10 +6363,13 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
%(klass)s
Object with missing values filled.
"""
- return self.fillna(method='bfill', axis=axis, inplace=inplace,
- limit=limit, downcast=downcast)
+ return self.fillna(
+ method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
+ )
- _shared_docs['replace'] = ("""
+ _shared_docs[
+ "replace"
+ ] = """
Replace values given in `to_replace` with `value`.
Values of the %(klass)s are replaced with other values dynamically.
@@ -6302,15 +6658,23 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
3 b
4 b
dtype: object
- """)
+ """
- @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
- def replace(self, to_replace=None, value=None, inplace=False, limit=None,
- regex=False, method='pad'):
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
+ def replace(
+ self,
+ to_replace=None,
+ value=None,
+ inplace=False,
+ limit=None,
+ regex=False,
+ method="pad",
+ ):
+ inplace = validate_bool_kwarg(inplace, "inplace")
if not is_bool(regex) and to_replace is not None:
- raise AssertionError("'to_replace' must be 'None' if 'regex' is "
- "not a bool")
+ raise AssertionError(
+ "'to_replace' must be 'None' if 'regex' is " "not a bool"
+ )
self._consolidate_inplace()
@@ -6322,17 +6686,18 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
if isinstance(to_replace, (tuple, list)):
if isinstance(self, pd.DataFrame):
- return self.apply(_single_replace,
- args=(to_replace, method, inplace,
- limit))
- return _single_replace(self, to_replace, method, inplace,
- limit)
+ return self.apply(
+ _single_replace, args=(to_replace, method, inplace, limit)
+ )
+ return _single_replace(self, to_replace, method, inplace, limit)
if not is_dict_like(to_replace):
if not is_dict_like(regex):
- raise TypeError('If "to_replace" and "value" are both None'
- ' and "to_replace" is not a list, then '
- 'regex must be a mapping')
+ raise TypeError(
+ 'If "to_replace" and "value" are both None'
+ ' and "to_replace" is not a list, then '
+ "regex must be a mapping"
+ )
to_replace = regex
regex = True
@@ -6343,9 +6708,11 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
if any(are_mappings):
if not all(are_mappings):
- raise TypeError("If a nested mapping is passed, all values"
- " of the top level mapping must be "
- "mappings")
+ raise TypeError(
+ "If a nested mapping is passed, all values"
+ " of the top level mapping must be "
+ "mappings"
+ )
# passed a nested dict/Series
to_rep_dict = {}
value_dict = {}
@@ -6353,8 +6720,10 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
for k, v in items:
keys, values = list(zip(*v.items())) or ([], [])
if set(keys) & set(values):
- raise ValueError("Replacement not allowed with "
- "overlapping keys and values")
+ raise ValueError(
+ "Replacement not allowed with "
+ "overlapping keys and values"
+ )
to_rep_dict[k] = list(keys)
value_dict[k] = list(values)
@@ -6362,8 +6731,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
else:
to_replace, value = keys, values
- return self.replace(to_replace, value, inplace=inplace,
- limit=limit, regex=regex)
+ return self.replace(
+ to_replace, value, inplace=inplace, limit=limit, regex=regex
+ )
else:
# need a non-zero len on all axes
@@ -6379,55 +6749,67 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
if c in value and c in self:
# object conversion is handled in
# series.replace which is called recursively
- res[c] = res[c].replace(to_replace=src,
- value=value[c],
- inplace=False,
- regex=regex)
+ res[c] = res[c].replace(
+ to_replace=src,
+ value=value[c],
+ inplace=False,
+ regex=regex,
+ )
return None if inplace else res
# {'A': NA} -> 0
elif not is_list_like(value):
- keys = [(k, src) for k, src in to_replace.items()
- if k in self]
+ keys = [(k, src) for k, src in to_replace.items() if k in self]
keys_len = len(keys) - 1
for i, (k, src) in enumerate(keys):
convert = i == keys_len
- new_data = new_data.replace(to_replace=src,
- value=value,
- filter=[k],
- inplace=inplace,
- regex=regex,
- convert=convert)
+ new_data = new_data.replace(
+ to_replace=src,
+ value=value,
+ filter=[k],
+ inplace=inplace,
+ regex=regex,
+ convert=convert,
+ )
else:
- raise TypeError('value argument must be scalar, dict, or '
- 'Series')
+ raise TypeError("value argument must be scalar, dict, or " "Series")
elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing']
if is_list_like(value):
if len(to_replace) != len(value):
- raise ValueError('Replacement lists must match '
- 'in length. Expecting %d got %d ' %
- (len(to_replace), len(value)))
-
- new_data = self._data.replace_list(src_list=to_replace,
- dest_list=value,
- inplace=inplace,
- regex=regex)
+ raise ValueError(
+ "Replacement lists must match "
+ "in length. Expecting %d got %d "
+ % (len(to_replace), len(value))
+ )
+
+ new_data = self._data.replace_list(
+ src_list=to_replace,
+ dest_list=value,
+ inplace=inplace,
+ regex=regex,
+ )
else: # [NA, ''] -> 0
- new_data = self._data.replace(to_replace=to_replace,
- value=value, inplace=inplace,
- regex=regex)
+ new_data = self._data.replace(
+ to_replace=to_replace, value=value, inplace=inplace, regex=regex
+ )
elif to_replace is None:
- if not (is_re_compilable(regex) or
- is_list_like(regex) or is_dict_like(regex)):
- raise TypeError("'regex' must be a string or a compiled "
- "regular expression or a list or dict of "
- "strings or regular expressions, you "
- "passed a"
- " {0!r}".format(type(regex).__name__))
- return self.replace(regex, value, inplace=inplace, limit=limit,
- regex=True)
+ if not (
+ is_re_compilable(regex)
+ or is_list_like(regex)
+ or is_dict_like(regex)
+ ):
+ raise TypeError(
+ "'regex' must be a string or a compiled "
+ "regular expression or a list or dict of "
+ "strings or regular expressions, you "
+ "passed a"
+ " {0!r}".format(type(regex).__name__)
+ )
+ return self.replace(
+ regex, value, inplace=inplace, limit=limit, regex=True
+ )
else:
# dest iterable dict-like
@@ -6436,18 +6818,22 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
for k, v in value.items():
if k in self:
- new_data = new_data.replace(to_replace=to_replace,
- value=v, filter=[k],
- inplace=inplace,
- regex=regex)
+ new_data = new_data.replace(
+ to_replace=to_replace,
+ value=v,
+ filter=[k],
+ inplace=inplace,
+ regex=regex,
+ )
elif not is_list_like(value): # NA -> 0
- new_data = self._data.replace(to_replace=to_replace,
- value=value, inplace=inplace,
- regex=regex)
+ new_data = self._data.replace(
+ to_replace=to_replace, value=value, inplace=inplace, regex=regex
+ )
else:
- msg = ('Invalid "to_replace" type: '
- '{0!r}').format(type(to_replace).__name__)
+ msg = ('Invalid "to_replace" type: ' "{0!r}").format(
+ type(to_replace).__name__
+ )
raise TypeError(msg) # pragma: no cover
if inplace:
@@ -6455,7 +6841,9 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
else:
return self._constructor(new_data).__finalize__(self)
- _shared_docs['interpolate'] = """
+ _shared_docs[
+ "interpolate"
+ ] = """
Please note that only ``method='linear'`` is supported for
DataFrame/Series with a MultiIndex.
@@ -6644,14 +7032,22 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
Name: d, dtype: float64
"""
- @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs)
- def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
- limit_direction='forward', limit_area=None,
- downcast=None, **kwargs):
+ @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs)
+ def interpolate(
+ self,
+ method="linear",
+ axis=0,
+ limit=None,
+ inplace=False,
+ limit_direction="forward",
+ limit_area=None,
+ downcast=None,
+ **kwargs
+ ):
"""
Interpolate values according to different methods.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if axis == 0:
ax = self._info_axis_name
@@ -6668,47 +7064,59 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
else:
alt_ax = ax
- if (isinstance(_maybe_transposed_self.index, MultiIndex) and
- method != 'linear'):
- raise ValueError("Only `method=linear` interpolation is supported "
- "on MultiIndexes.")
+ if isinstance(_maybe_transposed_self.index, MultiIndex) and method != "linear":
+ raise ValueError(
+ "Only `method=linear` interpolation is supported " "on MultiIndexes."
+ )
- if _maybe_transposed_self._data.get_dtype_counts().get(
- 'object') == len(_maybe_transposed_self.T):
- raise TypeError("Cannot interpolate with all object-dtype columns "
- "in the DataFrame. Try setting at least one "
- "column to a numeric dtype.")
+ if _maybe_transposed_self._data.get_dtype_counts().get("object") == len(
+ _maybe_transposed_self.T
+ ):
+ raise TypeError(
+ "Cannot interpolate with all object-dtype columns "
+ "in the DataFrame. Try setting at least one "
+ "column to a numeric dtype."
+ )
# create/use the index
- if method == 'linear':
+ if method == "linear":
# prior default
index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax)))
else:
index = _maybe_transposed_self._get_axis(alt_ax)
methods = {"index", "values", "nearest", "time"}
is_numeric_or_datetime = (
- is_numeric_dtype(index) or
- is_datetime64_dtype(index) or
- is_timedelta64_dtype(index)
+ is_numeric_dtype(index)
+ or is_datetime64_dtype(index)
+ or is_timedelta64_dtype(index)
)
if method not in methods and not is_numeric_or_datetime:
raise ValueError(
"Index column must be numeric or datetime type when "
"using {method} method other than linear. "
"Try setting a numeric or datetime index column before "
- "interpolating.".format(method=method))
+ "interpolating.".format(method=method)
+ )
if isna(index).any():
- raise NotImplementedError("Interpolation with NaNs in the index "
- "has not been implemented. Try filling "
- "those NaNs before interpolating.")
+ raise NotImplementedError(
+ "Interpolation with NaNs in the index "
+ "has not been implemented. Try filling "
+ "those NaNs before interpolating."
+ )
data = _maybe_transposed_self._data
- new_data = data.interpolate(method=method, axis=ax, index=index,
- values=_maybe_transposed_self, limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- inplace=inplace, downcast=downcast,
- **kwargs)
+ new_data = data.interpolate(
+ method=method,
+ axis=ax,
+ index=index,
+ values=_maybe_transposed_self,
+ limit=limit,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ inplace=inplace,
+ downcast=downcast,
+ **kwargs
+ )
if inplace:
if axis == 1:
@@ -6823,6 +7231,7 @@ def asof(self, where, subset=None):
"""
if isinstance(where, str):
from pandas import to_datetime
+
where = to_datetime(where)
if not self.index.is_monotonic:
@@ -6848,6 +7257,7 @@ def asof(self, where, subset=None):
if where < start:
if not is_series:
from pandas import Series
+
return Series(index=self.columns, name=where)
return np.nan
@@ -6858,7 +7268,7 @@ def asof(self, where, subset=None):
# code path whether *where* is a scalar or list.
# See PR: https://github.com/pandas-dev/pandas/pull/14476
if is_series:
- loc = self.index.searchsorted(where, side='right')
+ loc = self.index.searchsorted(where, side="right")
if loc > 0:
loc -= 1
@@ -6876,9 +7286,11 @@ def asof(self, where, subset=None):
return self._constructor(np.nan, index=where, name=self.name)
elif is_list:
from pandas import DataFrame
+
return DataFrame(np.nan, index=where, columns=self.columns)
else:
from pandas import Series
+
return Series(np.nan, index=self.columns, name=where[0])
locs = self.index.asof_locs(where, ~(nulls.values))
@@ -6893,7 +7305,9 @@ def asof(self, where, subset=None):
# ----------------------------------------------------------------------
# Action Methods
- _shared_docs['isna'] = """
+ _shared_docs[
+ "isna"
+ ] = """
Detect missing values.
Return a boolean same-sized object indicating if the values are NA.
@@ -6953,15 +7367,17 @@ def asof(self, where, subset=None):
dtype: bool
"""
- @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
def isna(self):
return isna(self).__finalize__(self)
- @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
def isnull(self):
return isna(self).__finalize__(self)
- _shared_docs['notna'] = """
+ _shared_docs[
+ "notna"
+ ] = """
Detect existing (non-missing) values.
Return a boolean same-sized object indicating if the values are not NA.
@@ -7021,23 +7437,24 @@ def isnull(self):
dtype: bool
"""
- @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
def notna(self):
return notna(self).__finalize__(self)
- @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
def notnull(self):
return notna(self).__finalize__(self)
def _clip_with_scalar(self, lower, upper, inplace=False):
- if ((lower is not None and np.any(isna(lower))) or
- (upper is not None and np.any(isna(upper)))):
+ if (lower is not None and np.any(isna(lower))) or (
+ upper is not None and np.any(isna(upper))
+ ):
raise ValueError("Cannot use an NA value as a clip threshold")
result = self
mask = isna(self.values)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
if upper is not None:
subset = self.to_numpy() <= upper
result = result.where(subset, upper, axis=None, inplace=False)
@@ -7060,7 +7477,7 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
# method is self.le for upper bound and self.ge for lower bound
if is_scalar(threshold) and is_number(threshold):
- if method.__name__ == 'le':
+ if method.__name__ == "le":
return self._clip_with_scalar(None, threshold, inplace=inplace)
return self._clip_with_scalar(threshold, None, inplace=inplace)
@@ -7073,12 +7490,10 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace):
if isinstance(self, ABCSeries):
threshold = pd.Series(threshold, index=self.index)
else:
- threshold = _align_method_FRAME(self, threshold,
- axis)
+ threshold = _align_method_FRAME(self, threshold, axis)
return self.where(subset, threshold, axis=axis, inplace=inplace)
- def clip(self, lower=None, upper=None, axis=None, inplace=False,
- *args, **kwargs):
+ def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs):
"""
Trim values at input threshold(s).
@@ -7151,7 +7566,7 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False,
3 6 8
4 5 3
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
axis = nv.validate_clip_with_axis(axis, args, kwargs)
if axis is not None:
@@ -7173,19 +7588,22 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False,
lower, upper = min(lower, upper), max(lower, upper)
# fast-path for scalars
- if ((lower is None or (is_scalar(lower) and is_number(lower))) and
- (upper is None or (is_scalar(upper) and is_number(upper)))):
+ if (lower is None or (is_scalar(lower) and is_number(lower))) and (
+ upper is None or (is_scalar(upper) and is_number(upper))
+ ):
return self._clip_with_scalar(lower, upper, inplace=inplace)
result = self
if lower is not None:
- result = result._clip_with_one_bound(lower, method=self.ge,
- axis=axis, inplace=inplace)
+ result = result._clip_with_one_bound(
+ lower, method=self.ge, axis=axis, inplace=inplace
+ )
if upper is not None:
if inplace:
result = self
- result = result._clip_with_one_bound(upper, method=self.le,
- axis=axis, inplace=inplace)
+ result = result._clip_with_one_bound(
+ upper, method=self.le, axis=axis, inplace=inplace
+ )
return result
@@ -7263,11 +7681,14 @@ def clip_upper(self, threshold, axis=None, inplace=False):
4 1
dtype: int64
"""
- warnings.warn('clip_upper(threshold) is deprecated, '
- 'use clip(upper=threshold) instead',
- FutureWarning, stacklevel=2)
- return self._clip_with_one_bound(threshold, method=self.le,
- axis=axis, inplace=inplace)
+ warnings.warn(
+ "clip_upper(threshold) is deprecated, " "use clip(upper=threshold) instead",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self._clip_with_one_bound(
+ threshold, method=self.le, axis=axis, inplace=inplace
+ )
def clip_lower(self, threshold, axis=None, inplace=False):
"""
@@ -7379,14 +7800,27 @@ def clip_lower(self, threshold, axis=None, inplace=False):
1 4 5
2 5 6
"""
- warnings.warn('clip_lower(threshold) is deprecated, '
- 'use clip(lower=threshold) instead',
- FutureWarning, stacklevel=2)
- return self._clip_with_one_bound(threshold, method=self.ge,
- axis=axis, inplace=inplace)
-
- def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
- group_keys=True, squeeze=False, observed=False, **kwargs):
+ warnings.warn(
+ "clip_lower(threshold) is deprecated, " "use clip(lower=threshold) instead",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self._clip_with_one_bound(
+ threshold, method=self.ge, axis=axis, inplace=inplace
+ )
+
+ def groupby(
+ self,
+ by=None,
+ axis=0,
+ level=None,
+ as_index=True,
+ sort=True,
+ group_keys=True,
+ squeeze=False,
+ observed=False,
+ **kwargs
+ ):
"""
Group DataFrame or Series using a mapper or by a Series of columns.
@@ -7501,12 +7935,20 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
if level is None and by is None:
raise TypeError("You have to supply one of 'by' and 'level'")
axis = self._get_axis_number(axis)
- return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
- sort=sort, group_keys=group_keys, squeeze=squeeze,
- observed=observed, **kwargs)
-
- def asfreq(self, freq, method=None, how=None, normalize=False,
- fill_value=None):
+ return groupby(
+ self,
+ by=by,
+ axis=axis,
+ level=level,
+ as_index=as_index,
+ sort=sort,
+ group_keys=group_keys,
+ squeeze=squeeze,
+ observed=observed,
+ **kwargs
+ )
+
+ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None):
"""
Convert TimeSeries to specified frequency.
@@ -7601,8 +8043,15 @@ def asfreq(self, freq, method=None, how=None, normalize=False,
2000-01-01 00:03:00 3.0
"""
from pandas.core.resample import asfreq
- return asfreq(self, freq, method=method, how=how, normalize=normalize,
- fill_value=fill_value)
+
+ return asfreq(
+ self,
+ freq,
+ method=method,
+ how=how,
+ normalize=normalize,
+ fill_value=fill_value,
+ )
def at_time(self, time, asof=False, axis=None):
"""
@@ -7656,12 +8105,13 @@ def at_time(self, time, asof=False, axis=None):
try:
indexer = index.indexer_at_time(time, asof=asof)
except AttributeError:
- raise TypeError('Index must be DatetimeIndex')
+ raise TypeError("Index must be DatetimeIndex")
return self._take(indexer, axis=axis)
- def between_time(self, start_time, end_time, include_start=True,
- include_end=True, axis=None):
+ def between_time(
+ self, start_time, end_time, include_start=True, include_end=True, axis=None
+ ):
"""
Select values between particular times of the day (e.g., 9:00-9:30 AM).
@@ -7726,16 +8176,32 @@ def between_time(self, start_time, end_time, include_start=True,
index = self._get_axis(axis)
try:
indexer = index.indexer_between_time(
- start_time, end_time, include_start=include_start,
- include_end=include_end)
+ start_time,
+ end_time,
+ include_start=include_start,
+ include_end=include_end,
+ )
except AttributeError:
- raise TypeError('Index must be DatetimeIndex')
+ raise TypeError("Index must be DatetimeIndex")
return self._take(indexer, axis=axis)
- def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
- label=None, convention='start', kind=None, loffset=None,
- limit=None, base=0, on=None, level=None):
+ def resample(
+ self,
+ rule,
+ how=None,
+ axis=0,
+ fill_method=None,
+ closed=None,
+ label=None,
+ convention="start",
+ kind=None,
+ loffset=None,
+ limit=None,
+ base=0,
+ on=None,
+ level=None,
+ ):
"""
Resample time-series data.
@@ -8020,17 +8486,25 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None,
2000-01-04 36 90
"""
- from pandas.core.resample import (resample,
- _maybe_process_deprecations)
+ from pandas.core.resample import resample, _maybe_process_deprecations
+
axis = self._get_axis_number(axis)
- r = resample(self, freq=rule, label=label, closed=closed,
- axis=axis, kind=kind, loffset=loffset,
- convention=convention,
- base=base, key=on, level=level)
- return _maybe_process_deprecations(r,
- how=how,
- fill_method=fill_method,
- limit=limit)
+ r = resample(
+ self,
+ freq=rule,
+ label=label,
+ closed=closed,
+ axis=axis,
+ kind=kind,
+ loffset=loffset,
+ convention=convention,
+ base=base,
+ key=on,
+ level=level,
+ )
+ return _maybe_process_deprecations(
+ r, how=how, fill_method=fill_method, limit=limit
+ )
def first(self, offset):
"""
@@ -8088,9 +8562,9 @@ def first(self, offset):
end_date = end = self.index[0] + offset
# Tick-like, e.g. 3 weeks
- if not offset.isAnchored() and hasattr(offset, '_inc'):
+ if not offset.isAnchored() and hasattr(offset, "_inc"):
if end_date in self.index:
- end = self.index.searchsorted(end_date, side='left')
+ end = self.index.searchsorted(end_date, side="left")
return self.iloc[:end]
return self.loc[:end]
@@ -8150,11 +8624,18 @@ def last(self, offset):
offset = to_offset(offset)
start_date = self.index[-1] - offset
- start = self.index.searchsorted(start_date, side='right')
+ start = self.index.searchsorted(start_date, side="right")
return self.iloc[start:]
- def rank(self, axis=0, method='average', numeric_only=None,
- na_option='keep', ascending=True, pct=False):
+ def rank(
+ self,
+ axis=0,
+ method="average",
+ numeric_only=None,
+ na_option="keep",
+ ascending=True,
+ pct=False,
+ ):
"""
Compute numerical data ranks (1 through n) along axis.
@@ -8238,14 +8719,19 @@ def rank(self, axis=0, method='average', numeric_only=None,
"""
axis = self._get_axis_number(axis)
- if na_option not in {'keep', 'top', 'bottom'}:
+ if na_option not in {"keep", "top", "bottom"}:
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
raise ValueError(msg)
def ranker(data):
- ranks = algos.rank(data.values, axis=axis, method=method,
- ascending=ascending, na_option=na_option,
- pct=pct)
+ ranks = algos.rank(
+ data.values,
+ axis=axis,
+ method=method,
+ ascending=ascending,
+ na_option=na_option,
+ pct=pct,
+ )
ranks = self._constructor(ranks, **data._construct_axes_dict())
return ranks.__finalize__(self)
@@ -8264,7 +8750,9 @@ def ranker(data):
return ranker(data)
- _shared_docs['align'] = ("""
+ _shared_docs[
+ "align"
+ ] = """
Align two objects on their axes with the
specified join method for each axis Index.
@@ -8304,13 +8792,24 @@ def ranker(data):
-------
(left, right) : (%(klass)s, type of other)
Aligned objects.
- """)
+ """
- @Appender(_shared_docs['align'] % _shared_doc_kwargs)
- def align(self, other, join='outer', axis=None, level=None, copy=True,
- fill_value=None, method=None, limit=None, fill_axis=0,
- broadcast_axis=None):
+ @Appender(_shared_docs["align"] % _shared_doc_kwargs)
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ broadcast_axis=None,
+ ):
from pandas import DataFrame, Series
+
method = missing.clean_fill_method(method)
if broadcast_axis == 1 and self.ndim != other.ndim:
@@ -8318,41 +8817,80 @@ def align(self, other, join='outer', axis=None, level=None, copy=True,
# this means other is a DataFrame, and we need to broadcast
# self
cons = self._constructor_expanddim
- df = cons({c: self for c in other.columns},
- **other._construct_axes_dict())
- return df._align_frame(other, join=join, axis=axis,
- level=level, copy=copy,
- fill_value=fill_value, method=method,
- limit=limit, fill_axis=fill_axis)
+ df = cons(
+ {c: self for c in other.columns}, **other._construct_axes_dict()
+ )
+ return df._align_frame(
+ other,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ )
elif isinstance(other, Series):
# this means self is a DataFrame, and we need to broadcast
# other
cons = other._constructor_expanddim
- df = cons({c: other for c in self.columns},
- **self._construct_axes_dict())
- return self._align_frame(df, join=join, axis=axis, level=level,
- copy=copy, fill_value=fill_value,
- method=method, limit=limit,
- fill_axis=fill_axis)
+ df = cons(
+ {c: other for c in self.columns}, **self._construct_axes_dict()
+ )
+ return self._align_frame(
+ df,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ )
if axis is not None:
axis = self._get_axis_number(axis)
if isinstance(other, DataFrame):
- return self._align_frame(other, join=join, axis=axis, level=level,
- copy=copy, fill_value=fill_value,
- method=method, limit=limit,
- fill_axis=fill_axis)
+ return self._align_frame(
+ other,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ )
elif isinstance(other, Series):
- return self._align_series(other, join=join, axis=axis, level=level,
- copy=copy, fill_value=fill_value,
- method=method, limit=limit,
- fill_axis=fill_axis)
+ return self._align_series(
+ other,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ )
else: # pragma: no cover
- raise TypeError('unsupported type: %s' % type(other))
-
- def _align_frame(self, other, join='outer', axis=None, level=None,
- copy=True, fill_value=None, method=None, limit=None,
- fill_axis=0):
+ raise TypeError("unsupported type: %s" % type(other))
+
+ def _align_frame(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ ):
# defaults
join_index, join_columns = None, None
ilidx, iridx = None, None
@@ -8363,26 +8901,30 @@ def _align_frame(self, other, join='outer', axis=None, level=None,
if axis is None or axis == 0:
if not self.index.equals(other.index):
join_index, ilidx, iridx = self.index.join(
- other.index, how=join, level=level, return_indexers=True)
+ other.index, how=join, level=level, return_indexers=True
+ )
if axis is None or axis == 1:
if not is_series and not self.columns.equals(other.columns):
join_columns, clidx, cridx = self.columns.join(
- other.columns, how=join, level=level, return_indexers=True)
+ other.columns, how=join, level=level, return_indexers=True
+ )
if is_series:
reindexers = {0: [join_index, ilidx]}
else:
reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
- left = self._reindex_with_indexers(reindexers, copy=copy,
- fill_value=fill_value,
- allow_dups=True)
+ left = self._reindex_with_indexers(
+ reindexers, copy=copy, fill_value=fill_value, allow_dups=True
+ )
# other must be always DataFrame
- right = other._reindex_with_indexers({0: [join_index, iridx],
- 1: [join_columns, cridx]},
- copy=copy, fill_value=fill_value,
- allow_dups=True)
+ right = other._reindex_with_indexers(
+ {0: [join_index, iridx], 1: [join_columns, cridx]},
+ copy=copy,
+ fill_value=fill_value,
+ allow_dups=True,
+ )
if method is not None:
left = left.fillna(axis=fill_axis, method=method, limit=limit)
@@ -8397,25 +8939,33 @@ def _align_frame(self, other, join='outer', axis=None, level=None,
return left.__finalize__(self), right.__finalize__(other)
- def _align_series(self, other, join='outer', axis=None, level=None,
- copy=True, fill_value=None, method=None, limit=None,
- fill_axis=0):
+ def _align_series(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ ):
is_series = isinstance(self, ABCSeries)
# series/series compat, other must always be a Series
if is_series:
if axis:
- raise ValueError('cannot align series to a series other than '
- 'axis 0')
+ raise ValueError("cannot align series to a series other than " "axis 0")
# equal
if self.index.equals(other.index):
join_index, lidx, ridx = None, None, None
else:
- join_index, lidx, ridx = self.index.join(other.index, how=join,
- level=level,
- return_indexers=True)
+ join_index, lidx, ridx = self.index.join(
+ other.index, how=join, level=level, return_indexers=True
+ )
left = self._reindex_indexer(join_index, lidx, copy)
right = other._reindex_indexer(join_index, ridx, copy)
@@ -8428,8 +8978,8 @@ def _align_series(self, other, join='outer', axis=None, level=None,
lidx, ridx = None, None
if not self.index.equals(other.index):
join_index, lidx, ridx = self.index.join(
- other.index, how=join, level=level,
- return_indexers=True)
+ other.index, how=join, level=level, return_indexers=True
+ )
if lidx is not None:
fdata = fdata.reindex_indexer(join_index, lidx, axis=1)
@@ -8439,13 +8989,13 @@ def _align_series(self, other, join='outer', axis=None, level=None,
lidx, ridx = None, None
if not self.columns.equals(other.index):
join_index, lidx, ridx = self.columns.join(
- other.index, how=join, level=level,
- return_indexers=True)
+ other.index, how=join, level=level, return_indexers=True
+ )
if lidx is not None:
fdata = fdata.reindex_indexer(join_index, lidx, axis=0)
else:
- raise ValueError('Must specify axis=0 or 1')
+ raise ValueError("Must specify axis=0 or 1")
if copy and fdata is self._data:
fdata = fdata.copy()
@@ -8460,8 +9010,7 @@ def _align_series(self, other, join='outer', axis=None, level=None,
# fill
fill_na = notna(fill_value) or (method is not None)
if fill_na:
- left = left.fillna(fill_value, method=method, limit=limit,
- axis=fill_axis)
+ left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
right = right.fillna(fill_value, method=method, limit=limit)
# if DatetimeIndex have different tz, convert to UTC
@@ -8474,24 +9023,31 @@ def _align_series(self, other, join='outer', axis=None, level=None,
return left.__finalize__(self), right.__finalize__(other)
- def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- errors='raise', try_cast=False):
+ def _where(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ errors="raise",
+ try_cast=False,
+ ):
"""
Equivalent to public method `where`, except that `other` is not
applied as a function even if callable. Used in __setitem__.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# align the cond to same shape as myself
cond = com.apply_if_callable(cond, self)
if isinstance(cond, NDFrame):
- cond, _ = cond.align(self, join='right', broadcast_axis=1)
+ cond, _ = cond.align(self, join="right", broadcast_axis=1)
else:
- if not hasattr(cond, 'shape'):
+ if not hasattr(cond, "shape"):
cond = np.asanyarray(cond)
if cond.shape != self.shape:
- raise ValueError('Array conditional must be same shape as '
- 'self')
+ raise ValueError("Array conditional must be same shape as " "self")
cond = self._constructor(cond, **self._construct_axes_dict())
# make sure we are boolean
@@ -8513,24 +9069,26 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
# try to align with other
try_quick = True
- if hasattr(other, 'align'):
+ if hasattr(other, "align"):
# align with me
if other.ndim <= self.ndim:
- _, other = self.align(other, join='left', axis=axis,
- level=level, fill_value=np.nan)
+ _, other = self.align(
+ other, join="left", axis=axis, level=level, fill_value=np.nan
+ )
# if we are NOT aligned, raise as we cannot where index
- if (axis is None and
- not all(other._get_axis(i).equals(ax)
- for i, ax in enumerate(self.axes))):
+ if axis is None and not all(
+ other._get_axis(i).equals(ax) for i, ax in enumerate(self.axes)
+ ):
raise InvalidIndexError
# slice me out of the other
else:
- raise NotImplementedError("cannot align with a higher "
- "dimensional NDFrame")
+ raise NotImplementedError(
+ "cannot align with a higher " "dimensional NDFrame"
+ )
if isinstance(other, np.ndarray):
@@ -8571,12 +9129,14 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
other = new_other
else:
- raise ValueError('Length of replacements must equal '
- 'series length')
+ raise ValueError(
+ "Length of replacements must equal " "series length"
+ )
else:
- raise ValueError('other must be the same shape as self '
- 'when an ndarray')
+ raise ValueError(
+ "other must be the same shape as self " "when an ndarray"
+ )
# we are the same shape, so create an actual object for alignment
else:
@@ -8585,10 +9145,10 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
if axis is None:
axis = 0
- if self.ndim == getattr(other, 'ndim', 0):
+ if self.ndim == getattr(other, "ndim", 0):
align = True
else:
- align = (self._get_axis_number(axis) == 1)
+ align = self._get_axis_number(axis) == 1
block_axis = self._get_block_manager_axis(axis)
@@ -8597,20 +9157,32 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
# reconstruct the block manager
self._check_inplace_setting(other)
- new_data = self._data.putmask(mask=cond, new=other, align=align,
- inplace=True, axis=block_axis,
- transpose=self._AXIS_REVERSED)
+ new_data = self._data.putmask(
+ mask=cond,
+ new=other,
+ align=align,
+ inplace=True,
+ axis=block_axis,
+ transpose=self._AXIS_REVERSED,
+ )
self._update_inplace(new_data)
else:
- new_data = self._data.where(other=other, cond=cond, align=align,
- errors=errors,
- try_cast=try_cast, axis=block_axis,
- transpose=self._AXIS_REVERSED)
+ new_data = self._data.where(
+ other=other,
+ cond=cond,
+ align=align,
+ errors=errors,
+ try_cast=try_cast,
+ axis=block_axis,
+ transpose=self._AXIS_REVERSED,
+ )
return self._constructor(new_data).__finalize__(self)
- _shared_docs['where'] = ("""
+ _shared_docs[
+ "where"
+ ] = """
Replace values where the condition is %(cond_rev)s.
Parameters
@@ -8731,36 +9303,75 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
2 True True
3 True True
4 True True
- """)
+ """
- @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True",
- cond_rev="False", name='where',
- name_other='mask'))
- def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- errors='raise', try_cast=False):
+ @Appender(
+ _shared_docs["where"]
+ % dict(
+ _shared_doc_kwargs,
+ cond="True",
+ cond_rev="False",
+ name="where",
+ name_other="mask",
+ )
+ )
+ def where(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ errors="raise",
+ try_cast=False,
+ ):
other = com.apply_if_callable(other, self)
- return self._where(cond, other, inplace, axis, level,
- errors=errors, try_cast=try_cast)
-
- @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False",
- cond_rev="True", name='mask',
- name_other='where'))
- def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
- errors='raise', try_cast=False):
-
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ return self._where(
+ cond, other, inplace, axis, level, errors=errors, try_cast=try_cast
+ )
+
+ @Appender(
+ _shared_docs["where"]
+ % dict(
+ _shared_doc_kwargs,
+ cond="False",
+ cond_rev="True",
+ name="mask",
+ name_other="where",
+ )
+ )
+ def mask(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ errors="raise",
+ try_cast=False,
+ ):
+
+ inplace = validate_bool_kwarg(inplace, "inplace")
cond = com.apply_if_callable(cond, self)
# see gh-21891
if not hasattr(cond, "__invert__"):
cond = np.array(cond)
- return self.where(~cond, other=other, inplace=inplace, axis=axis,
- level=level, try_cast=try_cast,
- errors=errors)
-
- _shared_docs['shift'] = ("""
+ return self.where(
+ ~cond,
+ other=other,
+ inplace=inplace,
+ axis=axis,
+ level=level,
+ try_cast=try_cast,
+ errors=errors,
+ )
+
+ _shared_docs[
+ "shift"
+ ] = """
Shift index by desired number of periods with an optional time `freq`.
When `freq` is not passed, shift the index without realigning the data.
@@ -8830,17 +9441,18 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None,
2 0 0 0
3 10 13 17
4 20 23 27
- """)
+ """
- @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
+ @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
if periods == 0:
return self.copy()
block_axis = self._get_block_manager_axis(axis)
if freq is None:
- new_data = self._data.shift(periods=periods, axis=block_axis,
- fill_value=fill_value)
+ new_data = self._data.shift(
+ periods=periods, axis=block_axis, fill_value=fill_value
+ )
else:
return self.tshift(periods, freq)
@@ -8908,13 +9520,13 @@ def tshift(self, periods=1, freq=None, axis=0):
index = self._get_axis(axis)
if freq is None:
- freq = getattr(index, 'freq', None)
+ freq = getattr(index, "freq", None)
if freq is None:
- freq = getattr(index, 'inferred_freq', None)
+ freq = getattr(index, "inferred_freq", None)
if freq is None:
- msg = 'Freq was not given and was not set in the index'
+ msg = "Freq was not given and was not set in the index"
raise ValueError(msg)
if periods == 0:
@@ -8930,8 +9542,10 @@ def tshift(self, periods=1, freq=None, axis=0):
new_data = self._data.copy()
new_data.axes[block_axis] = index.shift(periods)
else:
- msg = ('Given freq %s does not match PeriodIndex freq %s' %
- (freq.rule_code, orig_freq.rule_code))
+ msg = "Given freq %s does not match PeriodIndex freq %s" % (
+ freq.rule_code,
+ orig_freq.rule_code,
+ )
raise ValueError(msg)
else:
new_data = self._data.copy()
@@ -9072,21 +9686,20 @@ def truncate(self, before=None, after=None, axis=None, copy=True):
# treat like a slice
if ax.is_all_dates:
from pandas.core.tools.datetimes import to_datetime
+
before = to_datetime(before)
after = to_datetime(after)
if before is not None and after is not None:
if before > after:
- raise ValueError('Truncate: %s must be after %s' %
- (after, before))
+ raise ValueError("Truncate: %s must be after %s" % (after, before))
slicer = [slice(None, None)] * self._AXIS_LEN
slicer[axis] = slice(before, after)
result = self.loc[tuple(slicer)]
if isinstance(ax, MultiIndex):
- setattr(result, self._get_axis_name(axis),
- ax.truncate(before, after))
+ setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
if copy:
result = result.copy()
@@ -9121,11 +9734,12 @@ def tz_convert(self, tz, axis=0, level=None, copy=True):
ax = self._get_axis(axis)
def _tz_convert(ax, tz):
- if not hasattr(ax, 'tz_convert'):
+ if not hasattr(ax, "tz_convert"):
if len(ax) > 0:
ax_name = self._get_axis_name(axis)
- raise TypeError('%s is not a valid DatetimeIndex or '
- 'PeriodIndex' % ax_name)
+ raise TypeError(
+ "%s is not a valid DatetimeIndex or " "PeriodIndex" % ax_name
+ )
else:
ax = DatetimeIndex([], tz=tz)
else:
@@ -9147,8 +9761,9 @@ def _tz_convert(ax, tz):
result = result.set_axis(ax, axis=axis, inplace=False)
return result.__finalize__(self)
- def tz_localize(self, tz, axis=0, level=None, copy=True,
- ambiguous='raise', nonexistent='raise'):
+ def tz_localize(
+ self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise"
+ ):
"""
Localize tz-naive index of a Series or DataFrame to target time zone.
@@ -9269,38 +9884,37 @@ def tz_localize(self, tz, axis=0, level=None, copy=True,
2015-03-29 03:30:00+02:00 1
dtype: int64
"""
- nonexistent_options = ('raise', 'NaT', 'shift_forward',
- 'shift_backward')
+ nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
if nonexistent not in nonexistent_options and not isinstance(
- nonexistent, timedelta):
- raise ValueError("The nonexistent argument must be one of 'raise',"
- " 'NaT', 'shift_forward', 'shift_backward' or"
- " a timedelta object")
+ nonexistent, timedelta
+ ):
+ raise ValueError(
+ "The nonexistent argument must be one of 'raise',"
+ " 'NaT', 'shift_forward', 'shift_backward' or"
+ " a timedelta object"
+ )
axis = self._get_axis_number(axis)
ax = self._get_axis(axis)
def _tz_localize(ax, tz, ambiguous, nonexistent):
- if not hasattr(ax, 'tz_localize'):
+ if not hasattr(ax, "tz_localize"):
if len(ax) > 0:
ax_name = self._get_axis_name(axis)
- raise TypeError('%s is not a valid DatetimeIndex or '
- 'PeriodIndex' % ax_name)
+ raise TypeError(
+ "%s is not a valid DatetimeIndex or " "PeriodIndex" % ax_name
+ )
else:
ax = DatetimeIndex([], tz=tz)
else:
- ax = ax.tz_localize(
- tz, ambiguous=ambiguous, nonexistent=nonexistent
- )
+ ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
return ax
# if a level is given it must be a MultiIndex level or
# equivalent to the axis name
if isinstance(ax, MultiIndex):
level = ax._get_level_number(level)
- new_level = _tz_localize(
- ax.levels[level], tz, ambiguous, nonexistent
- )
+ new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
ax = ax.set_levels(new_level, level=level)
else:
if level not in (None, 0, ax.name):
@@ -9641,14 +10255,18 @@ def describe(self, percentiles=None, include=None, exclude=None):
formatted_percentiles = format_percentiles(percentiles)
def describe_numeric_1d(series):
- stat_index = (['count', 'mean', 'std', 'min'] +
- formatted_percentiles + ['max'])
- d = ([series.count(), series.mean(), series.std(), series.min()] +
- series.quantile(percentiles).tolist() + [series.max()])
+ stat_index = (
+ ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
+ )
+ d = (
+ [series.count(), series.mean(), series.std(), series.min()]
+ + series.quantile(percentiles).tolist()
+ + [series.max()]
+ )
return pd.Series(d, index=stat_index, name=series.name)
def describe_categorical_1d(data):
- names = ['count', 'unique']
+ names = ["count", "unique"]
objcounts = data.value_counts()
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
@@ -9658,27 +10276,30 @@ def describe_categorical_1d(data):
if is_datetime64_any_dtype(data):
tz = data.dt.tz
- asint = data.dropna().values.view('i8')
+ asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
- names += ['top', 'freq', 'first', 'last']
- result += [top, freq,
- Timestamp(asint.min(), tz=tz),
- Timestamp(asint.max(), tz=tz)]
+ names += ["top", "freq", "first", "last"]
+ result += [
+ top,
+ freq,
+ Timestamp(asint.min(), tz=tz),
+ Timestamp(asint.max(), tz=tz),
+ ]
else:
- names += ['top', 'freq']
+ names += ["top", "freq"]
result += [top, freq]
# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
- names += ['top', 'freq']
+ names += ["top", "freq"]
result += [np.nan, np.nan]
- dtype = 'object'
+ dtype = "object"
return pd.Series(result, index=names, name=data.name, dtype=dtype)
@@ -9699,7 +10320,7 @@ def describe_1d(data):
data = self.select_dtypes(include=[np.number])
if len(data.columns) == 0:
data = self
- elif include == 'all':
+ elif include == "all":
if exclude is not None:
msg = "exclude must be None when include is 'all'"
raise ValueError(msg)
@@ -9716,8 +10337,7 @@ def describe_1d(data):
if name not in names:
names.append(name)
- d = pd.concat([x.reindex(names, copy=False) for x in ldesc],
- axis=1, sort=False)
+ d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
d.columns = data.columns.copy()
return d
@@ -9726,8 +10346,7 @@ def _check_percentile(self, q):
Validate percentiles (used by describe and quantile).
"""
- msg = ("percentiles should all be in the interval [0, 1]. "
- "Try {0} instead.")
+ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead."
q = np.asarray(q)
if q.ndim == 0:
if not 0 <= q <= 1:
@@ -9737,7 +10356,9 @@ def _check_percentile(self, q):
raise ValueError(msg.format(q / 100.0))
return q
- _shared_docs['pct_change'] = """
+ _shared_docs[
+ "pct_change"
+ ] = """
Percentage change between the current and a prior element.
Computes the percentage change from the immediately previous row by
@@ -9852,18 +10473,16 @@ def _check_percentile(self, q):
APPL NaN 0.337604 0.012002
"""
- @Appender(_shared_docs['pct_change'] % _shared_doc_kwargs)
- def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
- **kwargs):
+ @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs)
+ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs):
# TODO: Not sure if above is correct - need someone to confirm.
- axis = self._get_axis_number(kwargs.pop('axis', self._stat_axis_name))
+ axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
if fill_method is None:
data = self
else:
data = self.fillna(method=fill_method, limit=limit, axis=axis)
- rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis,
- **kwargs)) - 1)
+ rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1
rs = rs.reindex_like(data)
if freq is None:
mask = isna(com.values_from_object(data))
@@ -9890,16 +10509,40 @@ def _add_numeric_operations(cls):
axis_descr, name, name2 = _doc_parms(cls)
cls.any = _make_logical_function(
- cls, 'any', name, name2, axis_descr, _any_desc, nanops.nanany,
- _any_see_also, _any_examples, empty_value=False)
+ cls,
+ "any",
+ name,
+ name2,
+ axis_descr,
+ _any_desc,
+ nanops.nanany,
+ _any_see_also,
+ _any_examples,
+ empty_value=False,
+ )
cls.all = _make_logical_function(
- cls, 'all', name, name2, axis_descr, _all_desc, nanops.nanall,
- _all_see_also, _all_examples, empty_value=True)
-
- @Substitution(desc="Return the mean absolute deviation of the values "
- "for the requested axis.",
- name1=name, name2=name2, axis_descr=axis_descr,
- min_count='', see_also='', examples='')
+ cls,
+ "all",
+ name,
+ name2,
+ axis_descr,
+ _all_desc,
+ nanops.nanall,
+ _all_see_also,
+ _all_examples,
+ empty_value=True,
+ )
+
+ @Substitution(
+ desc="Return the mean absolute deviation of the values "
+ "for the requested axis.",
+ name1=name,
+ name2=name2,
+ axis_descr=axis_descr,
+ min_count="",
+ see_also="",
+ examples="",
+ )
@Appender(_num_doc)
def mad(self, axis=None, skipna=None, level=None):
if skipna is None:
@@ -9907,8 +10550,7 @@ def mad(self, axis=None, skipna=None, level=None):
if axis is None:
axis = self._stat_axis_number
if level is not None:
- return self._agg_by_level('mad', axis=axis, level=level,
- skipna=skipna)
+ return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna)
data = self._get_numeric_data()
if axis == 0:
@@ -9920,31 +10562,54 @@ def mad(self, axis=None, skipna=None, level=None):
cls.mad = mad
cls.sem = _make_stat_function_ddof(
- cls, 'sem', name, name2, axis_descr,
+ cls,
+ "sem",
+ name,
+ name2,
+ axis_descr,
"Return unbiased standard error of the mean over requested "
"axis.\n\nNormalized by N-1 by default. This can be changed "
"using the ddof argument",
- nanops.nansem)
+ nanops.nansem,
+ )
cls.var = _make_stat_function_ddof(
- cls, 'var', name, name2, axis_descr,
+ cls,
+ "var",
+ name,
+ name2,
+ axis_descr,
"Return unbiased variance over requested axis.\n\nNormalized by "
"N-1 by default. This can be changed using the ddof argument",
- nanops.nanvar)
+ nanops.nanvar,
+ )
cls.std = _make_stat_function_ddof(
- cls, 'std', name, name2, axis_descr,
+ cls,
+ "std",
+ name,
+ name2,
+ axis_descr,
"Return sample standard deviation over requested axis."
"\n\nNormalized by N-1 by default. This can be changed using the "
"ddof argument",
- nanops.nanstd)
-
- @Substitution(desc="Return the compound percentage of the values for "
- "the requested axis.\n\n.. deprecated:: 0.25.0",
- name1=name, name2=name2, axis_descr=axis_descr,
- min_count='', see_also='', examples='')
+ nanops.nanstd,
+ )
+
+ @Substitution(
+ desc="Return the compound percentage of the values for "
+ "the requested axis.\n\n.. deprecated:: 0.25.0",
+ name1=name,
+ name2=name2,
+ axis_descr=axis_descr,
+ min_count="",
+ see_also="",
+ examples="",
+ )
@Appender(_num_doc)
def compound(self, axis=None, skipna=None, level=None):
- msg = ("The 'compound' method is deprecated and will be"
- "removed in a future version.")
+ msg = (
+ "The 'compound' method is deprecated and will be"
+ "removed in a future version."
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
if skipna is None:
skipna = True
@@ -9953,63 +10618,146 @@ def compound(self, axis=None, skipna=None, level=None):
cls.compound = compound
cls.cummin = _make_cum_function(
- cls, 'cummin', name, name2, axis_descr, "minimum",
- lambda y, axis: np.minimum.accumulate(y, axis), "min",
- np.inf, np.nan, _cummin_examples)
+ cls,
+ "cummin",
+ name,
+ name2,
+ axis_descr,
+ "minimum",
+ lambda y, axis: np.minimum.accumulate(y, axis),
+ "min",
+ np.inf,
+ np.nan,
+ _cummin_examples,
+ )
cls.cumsum = _make_cum_function(
- cls, 'cumsum', name, name2, axis_descr, "sum",
- lambda y, axis: y.cumsum(axis), "sum", 0.,
- np.nan, _cumsum_examples)
+ cls,
+ "cumsum",
+ name,
+ name2,
+ axis_descr,
+ "sum",
+ lambda y, axis: y.cumsum(axis),
+ "sum",
+ 0.0,
+ np.nan,
+ _cumsum_examples,
+ )
cls.cumprod = _make_cum_function(
- cls, 'cumprod', name, name2, axis_descr, "product",
- lambda y, axis: y.cumprod(axis), "prod", 1.,
- np.nan, _cumprod_examples)
+ cls,
+ "cumprod",
+ name,
+ name2,
+ axis_descr,
+ "product",
+ lambda y, axis: y.cumprod(axis),
+ "prod",
+ 1.0,
+ np.nan,
+ _cumprod_examples,
+ )
cls.cummax = _make_cum_function(
- cls, 'cummax', name, name2, axis_descr, "maximum",
- lambda y, axis: np.maximum.accumulate(y, axis), "max",
- -np.inf, np.nan, _cummax_examples)
+ cls,
+ "cummax",
+ name,
+ name2,
+ axis_descr,
+ "maximum",
+ lambda y, axis: np.maximum.accumulate(y, axis),
+ "max",
+ -np.inf,
+ np.nan,
+ _cummax_examples,
+ )
cls.sum = _make_min_count_stat_function(
- cls, 'sum', name, name2, axis_descr,
+ cls,
+ "sum",
+ name,
+ name2,
+ axis_descr,
"""Return the sum of the values for the requested axis.\n
This is equivalent to the method ``numpy.sum``.""",
- nanops.nansum, _stat_func_see_also, _sum_examples)
+ nanops.nansum,
+ _stat_func_see_also,
+ _sum_examples,
+ )
cls.mean = _make_stat_function(
- cls, 'mean', name, name2, axis_descr,
- 'Return the mean of the values for the requested axis.',
- nanops.nanmean)
+ cls,
+ "mean",
+ name,
+ name2,
+ axis_descr,
+ "Return the mean of the values for the requested axis.",
+ nanops.nanmean,
+ )
cls.skew = _make_stat_function(
- cls, 'skew', name, name2, axis_descr,
- 'Return unbiased skew over requested axis\nNormalized by N-1.',
- nanops.nanskew)
+ cls,
+ "skew",
+ name,
+ name2,
+ axis_descr,
+ "Return unbiased skew over requested axis\nNormalized by N-1.",
+ nanops.nanskew,
+ )
cls.kurt = _make_stat_function(
- cls, 'kurt', name, name2, axis_descr,
+ cls,
+ "kurt",
+ name,
+ name2,
+ axis_descr,
"Return unbiased kurtosis over requested axis using Fisher's "
"definition of\nkurtosis (kurtosis of normal == 0.0). Normalized "
"by N-1.",
- nanops.nankurt)
+ nanops.nankurt,
+ )
cls.kurtosis = cls.kurt
cls.prod = _make_min_count_stat_function(
- cls, 'prod', name, name2, axis_descr,
- 'Return the product of the values for the requested axis.',
- nanops.nanprod, examples=_prod_examples)
+ cls,
+ "prod",
+ name,
+ name2,
+ axis_descr,
+ "Return the product of the values for the requested axis.",
+ nanops.nanprod,
+ examples=_prod_examples,
+ )
cls.product = cls.prod
cls.median = _make_stat_function(
- cls, 'median', name, name2, axis_descr,
- 'Return the median of the values for the requested axis.',
- nanops.nanmedian)
+ cls,
+ "median",
+ name,
+ name2,
+ axis_descr,
+ "Return the median of the values for the requested axis.",
+ nanops.nanmedian,
+ )
cls.max = _make_stat_function(
- cls, 'max', name, name2, axis_descr,
+ cls,
+ "max",
+ name,
+ name2,
+ axis_descr,
"""Return the maximum of the values for the requested axis.\n
If you want the *index* of the maximum, use ``idxmax``. This is
the equivalent of the ``numpy.ndarray`` method ``argmax``.""",
- nanops.nanmax, _stat_func_see_also, _max_examples)
+ nanops.nanmax,
+ _stat_func_see_also,
+ _max_examples,
+ )
cls.min = _make_stat_function(
- cls, 'min', name, name2, axis_descr,
+ cls,
+ "min",
+ name,
+ name2,
+ axis_descr,
"""Return the minimum of the values for the requested axis.\n
If you want the *index* of the minimum, use ``idxmin``. This is
the equivalent of the ``numpy.ndarray`` method ``argmin``.""",
- nanops.nanmin, _stat_func_see_also, _min_examples)
+ nanops.nanmin,
+ _stat_func_see_also,
+ _min_examples,
+ )
@classmethod
def _add_series_only_operations(cls):
@@ -10023,18 +10771,26 @@ def _add_series_only_operations(cls):
def nanptp(values, axis=0, skipna=True):
nmax = nanops.nanmax(values, axis, skipna)
nmin = nanops.nanmin(values, axis, skipna)
- warnings.warn("Method .ptp is deprecated and will be removed "
- "in a future version. Use numpy.ptp instead.",
- FutureWarning, stacklevel=4)
+ warnings.warn(
+ "Method .ptp is deprecated and will be removed "
+ "in a future version. Use numpy.ptp instead.",
+ FutureWarning,
+ stacklevel=4,
+ )
return nmax - nmin
cls.ptp = _make_stat_function(
- cls, 'ptp', name, name2, axis_descr,
+ cls,
+ "ptp",
+ name,
+ name2,
+ axis_descr,
"""Return the difference between the maximum value and the
minimum value in the object. This is the equivalent of the
``numpy.ndarray`` method ``ptp``.\n\n.. deprecated:: 0.24.0
Use numpy.ptp instead""",
- nanptp)
+ nanptp,
+ )
@classmethod
def _add_series_or_dataframe_operations(cls):
@@ -10046,48 +10802,80 @@ def _add_series_or_dataframe_operations(cls):
from pandas.core import window as rwindow
@Appender(rwindow.rolling.__doc__)
- def rolling(self, window, min_periods=None, center=False,
- win_type=None, on=None, axis=0, closed=None):
+ def rolling(
+ self,
+ window,
+ min_periods=None,
+ center=False,
+ win_type=None,
+ on=None,
+ axis=0,
+ closed=None,
+ ):
axis = self._get_axis_number(axis)
- return rwindow.rolling(self, window=window,
- min_periods=min_periods,
- center=center, win_type=win_type,
- on=on, axis=axis, closed=closed)
+ return rwindow.rolling(
+ self,
+ window=window,
+ min_periods=min_periods,
+ center=center,
+ win_type=win_type,
+ on=on,
+ axis=axis,
+ closed=closed,
+ )
cls.rolling = rolling
@Appender(rwindow.expanding.__doc__)
def expanding(self, min_periods=1, center=False, axis=0):
axis = self._get_axis_number(axis)
- return rwindow.expanding(self, min_periods=min_periods,
- center=center, axis=axis)
+ return rwindow.expanding(
+ self, min_periods=min_periods, center=center, axis=axis
+ )
cls.expanding = expanding
@Appender(rwindow.ewm.__doc__)
- def ewm(self, com=None, span=None, halflife=None, alpha=None,
- min_periods=0, adjust=True, ignore_na=False,
- axis=0):
+ def ewm(
+ self,
+ com=None,
+ span=None,
+ halflife=None,
+ alpha=None,
+ min_periods=0,
+ adjust=True,
+ ignore_na=False,
+ axis=0,
+ ):
axis = self._get_axis_number(axis)
- return rwindow.ewm(self, com=com, span=span, halflife=halflife,
- alpha=alpha, min_periods=min_periods,
- adjust=adjust, ignore_na=ignore_na, axis=axis)
+ return rwindow.ewm(
+ self,
+ com=com,
+ span=span,
+ halflife=halflife,
+ alpha=alpha,
+ min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na,
+ axis=axis,
+ )
cls.ewm = ewm
- @Appender(_shared_docs['transform'] % dict(axis="", **_shared_doc_kwargs))
+ @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs))
def transform(self, func, *args, **kwargs):
result = self.agg(func, *args, **kwargs)
if is_scalar(result) or len(result) != len(self):
- raise ValueError("transforms cannot produce "
- "aggregated results")
+ raise ValueError("transforms cannot produce " "aggregated results")
return result
# ----------------------------------------------------------------------
# Misc methods
- _shared_docs['valid_index'] = """
+ _shared_docs[
+ "valid_index"
+ ] = """
Return index for %(position)s non-NA/null value.
Returns
@@ -10113,7 +10901,7 @@ def _find_valid_index(self, how):
-------
idx_first_valid : type of index
"""
- assert how in ['first', 'last']
+ assert how in ["first", "last"]
if len(self) == 0: # early stop
return None
@@ -10122,10 +10910,10 @@ def _find_valid_index(self, how):
if self.ndim == 2:
is_valid = is_valid.any(1) # reduce axis 1
- if how == 'first':
+ if how == "first":
idxpos = is_valid.values[::].argmax()
- if how == 'last':
+ if how == "last":
idxpos = len(self) - 1 - is_valid.values[::-1].argmax()
chk_notna = is_valid.iat[idxpos]
@@ -10135,23 +10923,25 @@ def _find_valid_index(self, how):
return None
return idx
- @Appender(_shared_docs['valid_index'] % {'position': 'first',
- 'klass': 'Series/DataFrame'})
+ @Appender(
+ _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"}
+ )
def first_valid_index(self):
- return self._find_valid_index('first')
+ return self._find_valid_index("first")
- @Appender(_shared_docs['valid_index'] % {'position': 'last',
- 'klass': 'Series/DataFrame'})
+ @Appender(
+ _shared_docs["valid_index"] % {"position": "last", "klass": "Series/DataFrame"}
+ )
def last_valid_index(self):
- return self._find_valid_index('last')
+ return self._find_valid_index("last")
def _doc_parms(cls):
"""Return a tuple of the doc parms."""
- axis_descr = "{%s}" % ', '.join("{0} ({1})".format(a, i)
- for i, a in enumerate(cls._AXIS_ORDERS))
- name = (cls._constructor_sliced.__name__
- if cls._AXIS_LEN > 1 else 'scalar')
+ axis_descr = "{%s}" % ", ".join(
+ "{0} ({1})".format(a, i) for i, a in enumerate(cls._AXIS_ORDERS)
+ )
+ name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
name2 = cls.__name__
return axis_descr, name, name2
@@ -10684,7 +11474,9 @@ def _doc_parms(cls):
Series([], dtype: bool)
"""
-_shared_docs['stat_func_example'] = """
+_shared_docs[
+ "stat_func_example"
+] = """
Examples
--------
@@ -10718,12 +11510,9 @@ def _doc_parms(cls):
cold {level_output_1}
Name: legs, dtype: int64"""
-_sum_examples = _shared_docs['stat_func_example'].format(
- stat_func='sum',
- verb='Sum',
- default_output=14,
- level_output_0=6,
- level_output_1=8)
+_sum_examples = _shared_docs["stat_func_example"].format(
+ stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
+)
_sum_examples += """
@@ -10747,19 +11536,13 @@ def _doc_parms(cls):
>>> pd.Series([np.nan]).sum(min_count=1)
nan"""
-_max_examples = _shared_docs['stat_func_example'].format(
- stat_func='max',
- verb='Max',
- default_output=8,
- level_output_0=4,
- level_output_1=8)
+_max_examples = _shared_docs["stat_func_example"].format(
+ stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
+)
-_min_examples = _shared_docs['stat_func_example'].format(
- stat_func='min',
- verb='Min',
- default_output=0,
- level_output_0=2,
- level_output_1=0)
+_min_examples = _shared_docs["stat_func_example"].format(
+ stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
+)
_stat_func_see_also = """
@@ -10812,18 +11595,31 @@ def _doc_parms(cls):
"""
-def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
- f, see_also='', examples=''):
- @Substitution(desc=desc, name1=name1, name2=name2,
- axis_descr=axis_descr, min_count=_min_count_stub,
- see_also=see_also, examples=examples)
+def _make_min_count_stat_function(
+ cls, name, name1, name2, axis_descr, desc, f, see_also="", examples=""
+):
+ @Substitution(
+ desc=desc,
+ name1=name1,
+ name2=name2,
+ axis_descr=axis_descr,
+ min_count=_min_count_stub,
+ see_also=see_also,
+ examples=examples,
+ )
@Appender(_num_doc)
- def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
- min_count=0,
- **kwargs):
- if name == 'sum':
+ def stat_func(
+ self,
+ axis=None,
+ skipna=None,
+ level=None,
+ numeric_only=None,
+ min_count=0,
+ **kwargs
+ ):
+ if name == "sum":
nv.validate_sum(tuple(), kwargs)
- elif name == 'prod':
+ elif name == "prod":
nv.validate_prod(tuple(), kwargs)
else:
nv.validate_stat_func(tuple(), kwargs, fname=name)
@@ -10832,23 +11628,38 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
if axis is None:
axis = self._stat_axis_number
if level is not None:
- return self._agg_by_level(name, axis=axis, level=level,
- skipna=skipna, min_count=min_count)
- return self._reduce(f, name, axis=axis, skipna=skipna,
- numeric_only=numeric_only, min_count=min_count)
+ return self._agg_by_level(
+ name, axis=axis, level=level, skipna=skipna, min_count=min_count
+ )
+ return self._reduce(
+ f,
+ name,
+ axis=axis,
+ skipna=skipna,
+ numeric_only=numeric_only,
+ min_count=min_count,
+ )
return set_function_name(stat_func, name, cls)
-def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f,
- see_also='', examples=''):
- @Substitution(desc=desc, name1=name1, name2=name2,
- axis_descr=axis_descr, min_count='', see_also=see_also,
- examples=examples)
+def _make_stat_function(
+ cls, name, name1, name2, axis_descr, desc, f, see_also="", examples=""
+):
+ @Substitution(
+ desc=desc,
+ name1=name1,
+ name2=name2,
+ axis_descr=axis_descr,
+ min_count="",
+ see_also=see_also,
+ examples=examples,
+ )
@Appender(_num_doc)
- def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
- **kwargs):
- if name == 'median':
+ def stat_func(
+ self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+ ):
+ if name == "median":
nv.validate_median(tuple(), kwargs)
else:
nv.validate_stat_func(tuple(), kwargs, fname=name)
@@ -10857,39 +11668,57 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
if axis is None:
axis = self._stat_axis_number
if level is not None:
- return self._agg_by_level(name, axis=axis, level=level,
- skipna=skipna)
- return self._reduce(f, name, axis=axis, skipna=skipna,
- numeric_only=numeric_only)
+ return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
+ return self._reduce(
+ f, name, axis=axis, skipna=skipna, numeric_only=numeric_only
+ )
return set_function_name(stat_func, name, cls)
def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f):
- @Substitution(desc=desc, name1=name1, name2=name2,
- axis_descr=axis_descr)
+ @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr)
@Appender(_num_ddof_doc)
- def stat_func(self, axis=None, skipna=None, level=None, ddof=1,
- numeric_only=None, **kwargs):
+ def stat_func(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
nv.validate_stat_ddof_func(tuple(), kwargs, fname=name)
if skipna is None:
skipna = True
if axis is None:
axis = self._stat_axis_number
if level is not None:
- return self._agg_by_level(name, axis=axis, level=level,
- skipna=skipna, ddof=ddof)
- return self._reduce(f, name, axis=axis, numeric_only=numeric_only,
- skipna=skipna, ddof=ddof)
+ return self._agg_by_level(
+ name, axis=axis, level=level, skipna=skipna, ddof=ddof
+ )
+ return self._reduce(
+ f, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
+ )
return set_function_name(stat_func, name, cls)
-def _make_cum_function(cls, name, name1, name2, axis_descr, desc,
- accum_func, accum_func_name, mask_a, mask_b, examples):
- @Substitution(desc=desc, name1=name1, name2=name2,
- axis_descr=axis_descr, accum_func_name=accum_func_name,
- examples=examples)
+def _make_cum_function(
+ cls,
+ name,
+ name1,
+ name2,
+ axis_descr,
+ desc,
+ accum_func,
+ accum_func_name,
+ mask_a,
+ mask_b,
+ examples,
+):
+ @Substitution(
+ desc=desc,
+ name1=name1,
+ name2=name2,
+ axis_descr=axis_descr,
+ accum_func_name=accum_func_name,
+ examples=examples,
+ )
@Appender(_cnum_doc)
def cum_func(self, axis=None, skipna=True, *args, **kwargs):
skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
@@ -10900,8 +11729,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
y = com.values_from_object(self).copy()
- if (skipna and
- issubclass(y.dtype.type, (np.datetime64, np.timedelta64))):
+ if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)):
result = accum_func(y, axis)
mask = isna(self)
np.putmask(result, mask, iNaT)
@@ -10914,29 +11742,41 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
result = accum_func(y, axis)
d = self._construct_axes_dict()
- d['copy'] = False
+ d["copy"] = False
return self._constructor(result, **d).__finalize__(self)
return set_function_name(cum_func, name, cls)
-def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f,
- see_also, examples, empty_value):
- @Substitution(desc=desc, name1=name1, name2=name2,
- axis_descr=axis_descr, see_also=see_also, examples=examples,
- empty_value=empty_value)
+def _make_logical_function(
+ cls, name, name1, name2, axis_descr, desc, f, see_also, examples, empty_value
+):
+ @Substitution(
+ desc=desc,
+ name1=name1,
+ name2=name2,
+ axis_descr=axis_descr,
+ see_also=see_also,
+ examples=examples,
+ empty_value=empty_value,
+ )
@Appender(_bool_doc)
- def logical_func(self, axis=0, bool_only=None, skipna=True, level=None,
- **kwargs):
+ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
nv.validate_logical_func(tuple(), kwargs, fname=name)
if level is not None:
if bool_only is not None:
- raise NotImplementedError("Option bool_only is not "
- "implemented with option level.")
- return self._agg_by_level(name, axis=axis, level=level,
- skipna=skipna)
- return self._reduce(f, name, axis=axis, skipna=skipna,
- numeric_only=bool_only, filter_type='bool')
+ raise NotImplementedError(
+ "Option bool_only is not " "implemented with option level."
+ )
+ return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
+ return self._reduce(
+ f,
+ name,
+ axis=axis,
+ skipna=skipna,
+ numeric_only=bool_only,
+ filter_type="bool",
+ )
return set_function_name(logical_func, name, cls)
diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py
index fe50bd91a4f56..252f20ed40068 100644
--- a/pandas/core/groupby/__init__.py
+++ b/pandas/core/groupby/__init__.py
@@ -1,4 +1,7 @@
from pandas.core.groupby.generic import ( # noqa: F401
- DataFrameGroupBy, NamedAgg, SeriesGroupBy)
+ DataFrameGroupBy,
+ NamedAgg,
+ SeriesGroupBy,
+)
from pandas.core.groupby.groupby import GroupBy # noqa: F401
from pandas.core.groupby.grouper import Grouper # noqa: F401
diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py
index cffe0e589c6bc..5c4f1fa3fbddf 100644
--- a/pandas/core/groupby/base.py
+++ b/pandas/core/groupby/base.py
@@ -21,7 +21,9 @@ def outer(self, *args, **kwargs):
def f(x):
x = self._shallow_copy(x, groupby=self._groupby)
return getattr(x, name)(*args, **kwargs)
+
return self._groupby.apply(f)
+
outer.__name__ = name
return outer
@@ -51,10 +53,7 @@ def _gotitem(self, key, ndim, subset=None):
except IndexError:
groupby = self._groupby
- self = self.__class__(subset,
- groupby=groupby,
- parent=self,
- **kwargs)
+ self = self.__class__(subset, groupby=groupby, parent=self, **kwargs)
self._reset_cache()
if subset.ndim == 2:
if is_scalar(key) and key in subset or is_list_like(key):
@@ -64,25 +63,41 @@ def _gotitem(self, key, ndim, subset=None):
# special case to prevent duplicate plots when catching exceptions when
# forwarding methods from NDFrames
-plotting_methods = frozenset(['plot', 'hist'])
-
-common_apply_whitelist = frozenset([
- 'quantile', 'fillna', 'mad', 'take',
- 'idxmax', 'idxmin', 'tshift',
- 'skew', 'corr', 'cov', 'diff'
-]) | plotting_methods
-
-series_apply_whitelist = ((common_apply_whitelist |
- {'nlargest', 'nsmallest',
- 'is_monotonic_increasing',
- 'is_monotonic_decreasing'})
- ) | frozenset(['dtype', 'unique'])
-
-dataframe_apply_whitelist = ((common_apply_whitelist |
- frozenset(['dtypes', 'corrwith'])))
-
-cython_transforms = frozenset(['cumprod', 'cumsum', 'shift',
- 'cummin', 'cummax'])
-
-cython_cast_blacklist = frozenset(['rank', 'count', 'size', 'idxmin',
- 'idxmax'])
+plotting_methods = frozenset(["plot", "hist"])
+
+common_apply_whitelist = (
+ frozenset(
+ [
+ "quantile",
+ "fillna",
+ "mad",
+ "take",
+ "idxmax",
+ "idxmin",
+ "tshift",
+ "skew",
+ "corr",
+ "cov",
+ "diff",
+ ]
+ )
+ | plotting_methods
+)
+
+series_apply_whitelist = (
+ (
+ common_apply_whitelist
+ | {
+ "nlargest",
+ "nsmallest",
+ "is_monotonic_increasing",
+ "is_monotonic_decreasing",
+ }
+ )
+) | frozenset(["dtype", "unique"])
+
+dataframe_apply_whitelist = common_apply_whitelist | frozenset(["dtypes", "corrwith"])
+
+cython_transforms = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"])
+
+cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
index 85f51323a97b5..fcf52ecfcbbcd 100644
--- a/pandas/core/groupby/categorical.py
+++ b/pandas/core/groupby/categorical.py
@@ -2,7 +2,10 @@
from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
- Categorical, CategoricalDtype, _recode_for_categories)
+ Categorical,
+ CategoricalDtype,
+ _recode_for_categories,
+)
def recode_for_groupby(c, sort, observed):
@@ -49,9 +52,7 @@ def recode_for_groupby(c, sort, observed):
# we recode according to the uniques
categories = c.categories.take(take_codes)
- codes = _recode_for_categories(c.codes,
- c.categories,
- categories)
+ codes = _recode_for_categories(c.codes, c.categories, categories)
# return a new categorical that maps our new codes
# and categories
@@ -68,8 +69,7 @@ def recode_for_groupby(c, sort, observed):
# But for groupby to work, all categories should be present,
# including those missing from the data (GH-13179), which .unique()
# above dropped
- cat = cat.add_categories(
- c.categories[~c.categories.isin(cat.categories)])
+ cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)])
return c.reorder_categories(cat.categories), None
@@ -96,5 +96,4 @@ def recode_from_groupby(c, sort, ci):
return ci.set_categories(c.categories)
# we are not sorting, so add unobserved to the end
- return ci.add_categories(
- c.categories[~c.categories.isin(ci.categories)])
+ return ci.add_categories(c.categories[~c.categories.isin(ci.categories)])
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 9e7dcafc0b1a4..7fd0ca94e7997 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -21,12 +21,20 @@
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution
-from pandas.core.dtypes.cast import (
- maybe_convert_objects, maybe_downcast_to_dtype)
+from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
- ensure_int64, ensure_platform_int, is_bool, is_datetimelike, is_dict_like,
- is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype,
- is_object_dtype, is_scalar)
+ ensure_int64,
+ ensure_platform_int,
+ is_bool,
+ is_datetimelike,
+ is_dict_like,
+ is_integer_dtype,
+ is_interval_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ is_object_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.missing import isna, notna
from pandas._typing import FrameOrSeries
@@ -36,8 +44,7 @@
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.groupby import base
-from pandas.core.groupby.groupby import (
- GroupBy, _apply_docs, _transform_template)
+from pandas.core.groupby.groupby import GroupBy, _apply_docs, _transform_template
from pandas.core.index import Index, MultiIndex
import pandas.core.indexes.base as ibase
from pandas.core.internals import BlockManager, make_block
@@ -55,10 +62,9 @@
ScalarResult = typing.TypeVar("ScalarResult")
-def whitelist_method_generator(base_class: Type[GroupBy],
- klass: Type[FrameOrSeries],
- whitelist: FrozenSet[str],
- ) -> Iterator[str]:
+def whitelist_method_generator(
+ base_class: Type[GroupBy], klass: Type[FrameOrSeries], whitelist: FrozenSet[str]
+) -> Iterator[str]:
"""
Yields all GroupBy member defs for DataFrame/Series names in whitelist.
@@ -80,8 +86,7 @@ class where members are defined.
Since we don't want to override methods explicitly defined in the
base class, any such name is skipped.
"""
- property_wrapper_template = \
- """@property
+ property_wrapper_template = """@property
def %(name)s(self) :
\"""%(doc)s\"""
return self.__getattr__('%(name)s')"""
@@ -94,14 +99,13 @@ def %(name)s(self) :
# ugly, but we need the name string itself in the method.
f = getattr(klass, name)
doc = f.__doc__
- doc = doc if type(doc) == str else ''
+ doc = doc if type(doc) == str else ""
wrapper_template = property_wrapper_template
- params = {'name': name, 'doc': doc}
+ params = {"name": name, "doc": doc}
yield wrapper_template % params
class NDFrameGroupBy(GroupBy):
-
def _iterate_slices(self):
if self.axis == 0:
# kludge
@@ -119,16 +123,15 @@ def _iterate_slices(self):
continue
yield val, slicer(val)
- def _cython_agg_general(self, how, alt=None, numeric_only=True,
- min_count=-1):
+ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1):
new_items, new_blocks = self._cython_agg_blocks(
- how, alt=alt, numeric_only=numeric_only, min_count=min_count)
+ how, alt=alt, numeric_only=numeric_only, min_count=min_count
+ )
return self._wrap_agged_blocks(new_items, new_blocks)
_block_agg_axis = 0
- def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
- min_count=-1):
+ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1):
# TODO: the actual managing of mgr_locs is a PITA
# here, it should happen via BlockManager.combine
@@ -145,7 +148,8 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
locs = block.mgr_locs.as_array
try:
result, _ = self.grouper.aggregate(
- block.values, how, axis=agg_axis, min_count=min_count)
+ block.values, how, axis=agg_axis, min_count=min_count
+ )
except NotImplementedError:
# generally if we have numeric_only=False
# and non-applicable functions
@@ -181,7 +185,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
new_blocks.append(newb)
if len(new_blocks) == 0:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
# reset the locs in the blocks to correspond to our
# current ordering
@@ -203,13 +207,13 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True,
offset = 0
for b in new_blocks:
loc = len(b.mgr_locs)
- b.mgr_locs = indexer[offset:(offset + loc)]
+ b.mgr_locs = indexer[offset : (offset + loc)]
offset += loc
return new_items, new_blocks
def aggregate(self, func, *args, **kwargs):
- _level = kwargs.pop('_level', None)
+ _level = kwargs.pop("_level", None)
relabeling = func is None and _is_multi_agg_with_relabel(**kwargs)
if relabeling:
@@ -218,8 +222,7 @@ def aggregate(self, func, *args, **kwargs):
kwargs = {}
elif func is None:
# nicer error message
- raise TypeError("Must provide 'func' or tuples of "
- "'(column, aggfunc).")
+ raise TypeError("Must provide 'func' or tuples of " "'(column, aggfunc).")
func = _maybe_mangle_lambdas(func)
@@ -238,11 +241,12 @@ def aggregate(self, func, *args, **kwargs):
try:
assert not args and not kwargs
result = self._aggregate_multiple_funcs(
- [func], _level=_level, _axis=self.axis)
+ [func], _level=_level, _axis=self.axis
+ )
result.columns = Index(
- result.columns.levels[0],
- name=self._selected_obj.columns.name)
+ result.columns.levels[0], name=self._selected_obj.columns.name
+ )
if isinstance(self.obj, SparseDataFrame):
# Backwards compat for groupby.agg() with sparse
@@ -266,7 +270,7 @@ def aggregate(self, func, *args, **kwargs):
def _aggregate_generic(self, func, *args, **kwargs):
if self.grouper.nkeys != 1:
- raise AssertionError('Number of keys must be 1')
+ raise AssertionError("Number of keys must be 1")
axis = self.axis
obj = self._obj_with_exclusions
@@ -275,16 +279,14 @@ def _aggregate_generic(self, func, *args, **kwargs):
if axis != obj._info_axis_number:
try:
for name, data in self:
- result[name] = self._try_cast(func(data, *args, **kwargs),
- data)
+ result[name] = self._try_cast(func(data, *args, **kwargs), data)
except Exception:
return self._aggregate_item_by_item(func, *args, **kwargs)
else:
for name in self.indices:
try:
data = self.get_group(name, obj=obj)
- result[name] = self._try_cast(func(data, *args, **kwargs),
- data)
+ result[name] = self._try_cast(func(data, *args, **kwargs), data)
except Exception:
wrapper = lambda x: func(x, *args, **kwargs)
result[name] = data.apply(wrapper, axis=axis)
@@ -304,8 +306,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs):
for item in obj:
try:
data = obj[item]
- colg = SeriesGroupBy(data, selection=item,
- grouper=self.grouper)
+ colg = SeriesGroupBy(data, selection=item, grouper=self.grouper)
cast = self._transform_should_cast(func)
@@ -342,8 +343,7 @@ def _decide_output_index(self, output, labels):
pass
if isinstance(labels, MultiIndex):
- output_keys = MultiIndex.from_tuples(output_keys,
- names=labels.names)
+ output_keys = MultiIndex.from_tuples(output_keys, names=labels.names)
return output_keys
@@ -369,8 +369,7 @@ def first_not_none(values):
# We'd prefer it return an empty dataframe.
return DataFrame()
elif isinstance(v, DataFrame):
- return self._concat_objects(keys, values,
- not_indexed_same=not_indexed_same)
+ return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
elif self.grouper.groupings is not None:
if len(self.grouper.groupings) > 1:
key_index = self.grouper.result_index
@@ -400,8 +399,7 @@ def first_not_none(values):
return DataFrame()
elif isinstance(v, NDFrame):
values = [
- x if x is not None else
- v._constructor(**v._construct_axes_dict())
+ x if x is not None else v._constructor(**v._construct_axes_dict())
for x in values
]
@@ -410,11 +408,8 @@ def first_not_none(values):
if isinstance(v, (np.ndarray, Index, Series)):
if isinstance(v, Series):
applied_index = self._selected_obj._get_axis(self.axis)
- all_indexed_same = _all_indexes_same([
- x.index for x in values
- ])
- singular_series = (len(values) == 1 and
- applied_index.nlevels == 1)
+ all_indexed_same = _all_indexes_same([x.index for x in values])
+ singular_series = len(values) == 1 and applied_index.nlevels == 1
# GH3596
# provide a reduction (Frame -> Series) if groups are
@@ -438,13 +433,12 @@ def first_not_none(values):
# path added as of GH 5545
elif all_indexed_same:
from pandas.core.reshape.concat import concat
+
return concat(values)
if not all_indexed_same:
# GH 8467
- return self._concat_objects(
- keys, values, not_indexed_same=True,
- )
+ return self._concat_objects(keys, values, not_indexed_same=True)
try:
if self.axis == 0:
@@ -462,33 +456,37 @@ def first_not_none(values):
                    # normally use vstack as it's faster than concat
# and if we have mi-columns
- if (isinstance(v.index, MultiIndex) or
- key_index is None or
- isinstance(key_index, MultiIndex)):
- stacked_values = np.vstack([
- np.asarray(v) for v in values
- ])
- result = DataFrame(stacked_values, index=key_index,
- columns=index)
+ if (
+ isinstance(v.index, MultiIndex)
+ or key_index is None
+ or isinstance(key_index, MultiIndex)
+ ):
+ stacked_values = np.vstack([np.asarray(v) for v in values])
+ result = DataFrame(
+ stacked_values, index=key_index, columns=index
+ )
else:
# GH5788 instead of stacking; concat gets the
# dtypes correct
from pandas.core.reshape.concat import concat
- result = concat(values, keys=key_index,
- names=key_index.names,
- axis=self.axis).unstack()
+
+ result = concat(
+ values,
+ keys=key_index,
+ names=key_index.names,
+ axis=self.axis,
+ ).unstack()
result.columns = index
else:
- stacked_values = np.vstack([np.asarray(v)
- for v in values])
- result = DataFrame(stacked_values.T, index=v.index,
- columns=key_index)
+ stacked_values = np.vstack([np.asarray(v) for v in values])
+ result = DataFrame(
+ stacked_values.T, index=v.index, columns=key_index
+ )
except (ValueError, AttributeError):
# GH1738: values is list of arrays of unequal lengths fall
            # through to the outer else clause
- return Series(values, index=key_index,
- name=self._selection_name)
+ return Series(values, index=key_index, name=self._selection_name)
# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
@@ -507,14 +505,13 @@ def first_not_none(values):
# self._selection_name not passed through to Series as the
# result should not take the name of original selection
# of columns
- return (Series(values, index=key_index)
- ._convert(datetime=True,
- coerce=coerce))
+ return Series(values, index=key_index)._convert(
+ datetime=True, coerce=coerce
+ )
else:
# Handle cases like BinGrouper
- return self._concat_objects(keys, values,
- not_indexed_same=not_indexed_same)
+ return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
def _transform_general(self, func, *args, **kwargs):
from pandas.core.reshape.concat import concat
@@ -526,7 +523,7 @@ def _transform_general(self, func, *args, **kwargs):
path = None
for name, group in gen:
- object.__setattr__(group, 'name', name)
+ object.__setattr__(group, "name", name)
if path is None:
# Try slow path and fast path.
@@ -535,7 +532,7 @@ def _transform_general(self, func, *args, **kwargs):
except TypeError:
return self._transform_item_by_item(obj, fast_path)
except ValueError:
- msg = 'transform must return a scalar value for each group'
+ msg = "transform must return a scalar value for each group"
raise ValueError(msg)
else:
res = path(group)
@@ -553,9 +550,12 @@ def _transform_general(self, func, *args, **kwargs):
r.index = group.index
else:
r = DataFrame(
- np.concatenate([res.values] * len(group.index)
- ).reshape(group.shape),
- columns=group.columns, index=group.index)
+ np.concatenate([res.values] * len(group.index)).reshape(
+ group.shape
+ ),
+ columns=group.columns,
+ index=group.index,
+ )
applied.append(r)
else:
@@ -564,11 +564,10 @@ def _transform_general(self, func, *args, **kwargs):
concat_index = obj.columns if self.axis == 0 else obj.index
other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1
concatenated = concat(applied, axis=self.axis, verify_integrity=False)
- concatenated = concatenated.reindex(concat_index, axis=other_axis,
- copy=False)
+ concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
return self._set_result_index_ordered(concatenated)
- @Substitution(klass='DataFrame', selected='')
+ @Substitution(klass="DataFrame", selected="")
@Appender(_transform_template)
def transform(self, func, *args, **kwargs):
@@ -614,18 +613,19 @@ def _transform_fast(self, result, obj, func_nm):
res = self._try_cast(res, obj.iloc[:, i])
output.append(res)
- return DataFrame._from_arrays(output, columns=result.columns,
- index=obj.index)
+ return DataFrame._from_arrays(output, columns=result.columns, index=obj.index)
def _define_paths(self, func, *args, **kwargs):
if isinstance(func, str):
fast_path = lambda group: getattr(group, func)(*args, **kwargs)
slow_path = lambda group: group.apply(
- lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
+ lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
+ )
else:
fast_path = lambda group: func(group, *args, **kwargs)
slow_path = lambda group: group.apply(
- lambda x: func(x, *args, **kwargs), axis=self.axis)
+ lambda x: func(x, *args, **kwargs), axis=self.axis
+ )
return fast_path, slow_path
def _choose_path(self, fast_path, slow_path, group):
@@ -663,7 +663,7 @@ def _transform_item_by_item(self, obj, wrapper):
pass
if len(output) == 0: # pragma: no cover
- raise TypeError('Transform function invalid for data types')
+ raise TypeError("Transform function invalid for data types")
columns = obj.columns
if len(output) < len(obj.columns):
@@ -712,7 +712,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa
gen = self.grouper.get_iterator(obj, axis=self.axis)
for name, group in gen:
- object.__setattr__(group, 'name', name)
+ object.__setattr__(group, "name", name)
res = func(group, *args, **kwargs)
@@ -727,9 +727,10 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa
indices.append(self._get_index(name))
else:
# non scalars aren't allowed
- raise TypeError("filter function returned a %s, "
- "but expected a scalar bool" %
- type(res).__name__)
+ raise TypeError(
+ "filter function returned a %s, "
+ "but expected a scalar bool" % type(res).__name__
+ )
return self._apply_filter(indices, dropna)
@@ -739,8 +740,7 @@ class SeriesGroupBy(GroupBy):
# Make class defs of attributes on SeriesGroupBy whitelist
_apply_whitelist = base.series_apply_whitelist
- for _def_str in whitelist_method_generator(
- GroupBy, Series, _apply_whitelist):
+ for _def_str in whitelist_method_generator(GroupBy, Series, _apply_whitelist):
exec(_def_str)
@property
@@ -755,15 +755,18 @@ def _selection_name(self):
else:
return self._selection
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
pandas.Series.groupby.apply
pandas.Series.groupby.transform
pandas.Series.aggregate
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
>>> s = pd.Series([1, 2, 3, 4])
@@ -800,27 +803,33 @@ def _selection_name(self):
minimum maximum
1 1 2
2 3 4
- """)
+ """
+ )
- @Appender(_apply_docs['template']
- .format(input='series',
- examples=_apply_docs['series_examples']))
+ @Appender(
+ _apply_docs["template"].format(
+ input="series", examples=_apply_docs["series_examples"]
+ )
+ )
def apply(self, func, *args, **kwargs):
return super().apply(func, *args, **kwargs)
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='Series',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="Series",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, func_or_funcs=None, *args, **kwargs):
- _level = kwargs.pop('_level', None)
+ _level = kwargs.pop("_level", None)
relabeling = func_or_funcs is None
columns = None
- no_arg_message = ("Must provide 'func_or_funcs' or named "
- "aggregation **kwargs.")
+ no_arg_message = (
+ "Must provide 'func_or_funcs' or named " "aggregation **kwargs."
+ )
if relabeling:
columns = list(kwargs)
if not PY36:
@@ -839,8 +848,7 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs):
# Catch instances of lists / tuples
# but not the class list / tuple itself.
func_or_funcs = _maybe_mangle_lambdas(func_or_funcs)
- ret = self._aggregate_multiple_funcs(func_or_funcs,
- (_level or 0) + 1)
+ ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1)
if relabeling:
ret.columns = columns
else:
@@ -860,11 +868,12 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs):
ret = Series(result, index=index)
if not self.as_index: # pragma: no cover
- print('Warning, ignoring as_index=True')
+ print("Warning, ignoring as_index=True")
# _level handled at higher
if not _level and isinstance(ret, dict):
from pandas import concat
+
ret = concat(ret, axis=1)
return ret
@@ -877,20 +886,21 @@ def _aggregate_multiple_funcs(self, arg, _level):
# have not shown a higher level one
# GH 15931
if isinstance(self._selected_obj, Series) and _level <= 1:
- msg = dedent("""\
+ msg = dedent(
+ """\
using a dict on a Series for aggregation
is deprecated and will be removed in a future version. Use \
named aggregation instead.
>>> grouper.agg(name_1=func_1, name_2=func_2)
- """)
+ """
+ )
warnings.warn(msg, FutureWarning, stacklevel=3)
columns = list(arg.keys())
arg = arg.items()
elif any(isinstance(x, (tuple, list)) for x in arg):
- arg = [(x, x) if not isinstance(x, (tuple, list)) else x
- for x in arg]
+ arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
# indicated column order
columns = next(zip(*arg))
@@ -907,8 +917,9 @@ def _aggregate_multiple_funcs(self, arg, _level):
obj = self
if name in results:
raise SpecificationError(
- 'Function names must be unique, found multiple named '
- '{}'.format(name))
+ "Function names must be unique, found multiple named "
+ "{}".format(name)
+ )
# reset the cache so that we
# only include the named selection
@@ -938,15 +949,13 @@ def _wrap_output(self, output, index, names=None):
return Series(output, index=index, name=name)
def _wrap_aggregated_output(self, output, names=None):
- result = self._wrap_output(output=output,
- index=self.grouper.result_index,
- names=names)
+ result = self._wrap_output(
+ output=output, index=self.grouper.result_index, names=names
+ )
return self._reindex_output(result)._convert(datetime=True)
def _wrap_transformed_output(self, output, names=None):
- return self._wrap_output(output=output,
- index=self.obj.index,
- names=names)
+ return self._wrap_output(output=output, index=self.obj.index, names=names)
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
@@ -971,17 +980,13 @@ def _get_index():
return result
if isinstance(values[0], Series):
- return self._concat_objects(keys, values,
- not_indexed_same=not_indexed_same)
+ return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
elif isinstance(values[0], DataFrame):
# possible that Series -> DataFrame by applied function
- return self._concat_objects(keys, values,
- not_indexed_same=not_indexed_same)
+ return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
else:
# GH #6265 #24880
- result = Series(data=values,
- index=_get_index(),
- name=self._selection_name)
+ result = Series(data=values, index=_get_index(), name=self._selection_name)
return self._reindex_output(result)
def _aggregate_named(self, func, *args, **kwargs):
@@ -991,12 +996,12 @@ def _aggregate_named(self, func, *args, **kwargs):
group.name = name
output = func(group, *args, **kwargs)
if isinstance(output, (Series, Index, np.ndarray)):
- raise Exception('Must produce aggregated value')
+ raise Exception("Must produce aggregated value")
result[name] = self._try_cast(output, group)
return result
- @Substitution(klass='Series', selected='A.')
+ @Substitution(klass="Series", selected="A.")
@Appender(_transform_template)
def transform(self, func, *args, **kwargs):
func = self._is_cython_func(func) or func
@@ -1009,17 +1014,18 @@ def transform(self, func, *args, **kwargs):
else:
# cythonized aggregation and merge
return self._transform_fast(
- lambda: getattr(self, func)(*args, **kwargs), func)
+ lambda: getattr(self, func)(*args, **kwargs), func
+ )
# reg transform
klass = self._selected_obj.__class__
results = []
wrapper = lambda x: func(x, *args, **kwargs)
for name, group in self:
- object.__setattr__(group, 'name', name)
+ object.__setattr__(group, "name", name)
res = wrapper(group)
- if hasattr(res, 'values'):
+ if hasattr(res, "values"):
res = res.values
indexer = self._get_index(name)
@@ -1029,6 +1035,7 @@ def transform(self, func, *args, **kwargs):
# check for empty "results" to avoid concat ValueError
if results:
from pandas.core.reshape.concat import concat
+
result = concat(results).sort_index()
else:
result = Series()
@@ -1099,8 +1106,9 @@ def true_and_notna(x, *args, **kwargs):
return b and notna(b)
try:
- indices = [self._get_index(name) for name, group in self
- if true_and_notna(group)]
+ indices = [
+ self._get_index(name) for name, group in self if true_and_notna(group)
+ ]
except ValueError:
raise TypeError("the filter must return a boolean result")
except TypeError:
@@ -1125,7 +1133,7 @@ def nunique(self, dropna=True):
try:
sorter = np.lexsort((val, ids))
except TypeError: # catches object dtypes
- msg = 'val.dtype must be object, got {}'.format(val.dtype)
+ msg = "val.dtype must be object, got {}".format(val.dtype)
assert val.dtype == object, msg
val, _ = algorithms.factorize(val, sort=False)
sorter = np.lexsort((val, ids))
@@ -1149,7 +1157,7 @@ def nunique(self, dropna=True):
inc[mask & np.r_[False, mask[:-1]]] = 0
inc[idx] = 1
- out = np.add.reduceat(inc, idx).astype('int64', copy=False)
+ out = np.add.reduceat(inc, idx).astype("int64", copy=False)
if len(ids):
# NaN/NaT group exists if the head of ids is -1,
# so remove it from res and exclude its index from idx
@@ -1167,9 +1175,7 @@ def nunique(self, dropna=True):
res, out = np.zeros(len(ri), dtype=out.dtype), res
res[ids[idx]] = out
- return Series(res,
- index=ri,
- name=self._selection_name)
+ return Series(res, index=ri, name=self._selection_name)
@Appender(Series.describe.__doc__)
def describe(self, **kwargs):
@@ -1178,8 +1184,9 @@ def describe(self, **kwargs):
return result.T
return result.unstack()
- def value_counts(self, normalize=False, sort=True, ascending=False,
- bins=None, dropna=True):
+ def value_counts(
+ self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
+ ):
from pandas.core.reshape.tile import cut
from pandas.core.reshape.merge import _get_join_indexers
@@ -1187,11 +1194,13 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
if bins is not None and not np.iterable(bins):
# scalar bins cannot be done at top level
# in a backward compatible way
- return self.apply(Series.value_counts,
- normalize=normalize,
- sort=sort,
- ascending=ascending,
- bins=bins)
+ return self.apply(
+ Series.value_counts,
+ normalize=normalize,
+ sort=sort,
+ ascending=ascending,
+ bins=bins,
+ )
ids, _, _ = self.grouper.group_info
val = self.obj._internal_get_values()
@@ -1244,7 +1253,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
out, labels = out[mask], [label[mask] for label in labels]
if normalize:
- out = out.astype('float')
+ out = out.astype("float")
d = np.diff(np.r_[idx, len(ids)])
if dropna:
m = ids[lab == -1]
@@ -1260,8 +1269,9 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
out, labels[-1] = out[sorter], labels[-1][sorter]
if bins is None:
- mi = MultiIndex(levels=levels, codes=labels, names=names,
- verify_integrity=False)
+ mi = MultiIndex(
+ levels=levels, codes=labels, names=names, verify_integrity=False
+ )
if is_integer_dtype(out):
out = ensure_int64(out)
@@ -1269,18 +1279,17 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
# for compat. with libgroupby.value_counts need to ensure every
# bin is present at every index level, null filled with zeros
- diff = np.zeros(len(out), dtype='bool')
+ diff = np.zeros(len(out), dtype="bool")
for lab in labels[:-1]:
diff |= np.r_[True, lab[1:] != lab[:-1]]
ncat, nbin = diff.sum(), len(levels[-1])
- left = [np.repeat(np.arange(ncat), nbin),
- np.tile(np.arange(nbin), ncat)]
+ left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
right = [diff.cumsum() - 1, labels[-1]]
- _, idx = _get_join_indexers(left, right, sort=False, how='left')
+ _, idx = _get_join_indexers(left, right, sort=False, how="left")
out = np.where(idx != -1, out[idx], 0)
if sort:
@@ -1291,8 +1300,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
codes = list(map(lambda lab: np.repeat(lab[diff], nbin), labels[:-1]))
codes.append(left[-1])
- mi = MultiIndex(levels=levels, codes=codes, names=names,
- verify_integrity=False)
+ mi = MultiIndex(levels=levels, codes=codes, names=names, verify_integrity=False)
if is_integer_dtype(out):
out = ensure_int64(out)
@@ -1315,22 +1323,26 @@ def count(self):
minlength = ngroups or 0
out = np.bincount(ids[mask], minlength=minlength)
- return Series(out,
- index=self.grouper.result_index,
- name=self._selection_name,
- dtype='int64')
+ return Series(
+ out,
+ index=self.grouper.result_index,
+ name=self._selection_name,
+ dtype="int64",
+ )
def _apply_to_column_groupbys(self, func):
""" return a pass thru """
return func(self)
- def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):
+ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None):
"""Calculate pct_change of each value to previous entry in group"""
# TODO: Remove this conditional when #23918 is fixed
if freq:
- return self.apply(lambda x: x.pct_change(periods=periods,
- fill_method=fill_method,
- limit=limit, freq=freq))
+ return self.apply(
+ lambda x: x.pct_change(
+ periods=periods, fill_method=fill_method, limit=limit, freq=freq
+ )
+ )
filled = getattr(self, fill_method)(limit=limit)
fill_grp = filled.groupby(self.grouper.labels)
shifted = fill_grp.shift(periods=periods, freq=freq)
@@ -1344,21 +1356,23 @@ class DataFrameGroupBy(NDFrameGroupBy):
#
# Make class defs of attributes on DataFrameGroupBy whitelist.
- for _def_str in whitelist_method_generator(
- GroupBy, DataFrame, _apply_whitelist):
+ for _def_str in whitelist_method_generator(GroupBy, DataFrame, _apply_whitelist):
exec(_def_str)
_block_agg_axis = 1
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
pandas.DataFrame.groupby.apply
pandas.DataFrame.groupby.transform
pandas.DataFrame.aggregate
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
@@ -1426,14 +1440,17 @@ class DataFrameGroupBy(NDFrameGroupBy):
As usual, the aggregation can be a callable or a string alias.
See :ref:`groupby.aggregate.named` for more.
- """)
-
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='DataFrame',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="DataFrame",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, arg=None, *args, **kwargs):
return super().aggregate(arg, *args, **kwargs)
@@ -1456,17 +1473,21 @@ def _gotitem(self, key, ndim, subset=None):
if ndim == 2:
if subset is None:
subset = self.obj
- return DataFrameGroupBy(subset, self.grouper, selection=key,
- grouper=self.grouper,
- exclusions=self.exclusions,
- as_index=self.as_index,
- observed=self.observed)
+ return DataFrameGroupBy(
+ subset,
+ self.grouper,
+ selection=key,
+ grouper=self.grouper,
+ exclusions=self.exclusions,
+ as_index=self.as_index,
+ observed=self.observed,
+ )
elif ndim == 1:
if subset is None:
subset = self.obj[key]
- return SeriesGroupBy(subset, selection=key,
- grouper=self.grouper,
- observed=self.observed)
+ return SeriesGroupBy(
+ subset, selection=key, grouper=self.grouper, observed=self.observed
+ )
raise AssertionError("invalid ndim for _gotitem")
@@ -1474,11 +1495,9 @@ def _wrap_generic_output(self, result, obj):
result_index = self.grouper.levels[0]
if self.axis == 0:
- return DataFrame(result, index=obj.columns,
- columns=result_index).T
+ return DataFrame(result, index=obj.columns, columns=result_index).T
else:
- return DataFrame(result, index=obj.index,
- columns=result_index)
+ return DataFrame(result, index=obj.index, columns=result_index)
def _get_data_to_aggregate(self):
obj = self._obj_with_exclusions
@@ -1489,10 +1508,16 @@ def _get_data_to_aggregate(self):
def _insert_inaxis_grouper_inplace(self, result):
# zip in reverse so we can always insert at loc 0
- izip = zip(* map(reversed, (
- self.grouper.names,
- self.grouper.get_group_levels(),
- [grp.in_axis for grp in self.grouper.groupings])))
+ izip = zip(
+ *map(
+ reversed,
+ (
+ self.grouper.names,
+ self.grouper.get_group_levels(),
+ [grp.in_axis for grp in self.grouper.groupings],
+ ),
+ )
+ )
for name, lev, in_axis in izip:
if in_axis:
@@ -1540,17 +1565,21 @@ def _wrap_agged_blocks(self, items, blocks):
def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
- yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i],
- selection=colname,
- grouper=self.grouper,
- exclusions=self.exclusions)
+ yield colname, SeriesGroupBy(
+ self._selected_obj.iloc[:, i],
+ selection=colname,
+ grouper=self.grouper,
+ exclusions=self.exclusions,
+ )
def _apply_to_column_groupbys(self, func):
from pandas.core.reshape.concat import concat
+
return concat(
- (func(col_groupby) for _, col_groupby
- in self._iterate_column_groupbys()),
- keys=self._selected_obj.columns, axis=1)
+ (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()),
+ keys=self._selected_obj.columns,
+ axis=1,
+ )
def count(self):
"""
@@ -1567,12 +1596,10 @@ def count(self):
ids, _, ngroups = self.grouper.group_info
mask = ids != -1
- val = ((mask & ~_isna(np.atleast_2d(blk.get_values())))
- for blk in data.blocks)
+ val = ((mask & ~_isna(np.atleast_2d(blk.get_values()))) for blk in data.blocks)
loc = (blk.mgr_locs for blk in data.blocks)
- counter = partial(
- lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1)
+ counter = partial(lib.count_level_2d, labels=ids, max_bin=ngroups, axis=1)
blk = map(make_block, map(counter, val), loc)
return self._wrap_agged_blocks(data.items, list(blk))
@@ -1628,14 +1655,15 @@ def nunique(self, dropna=True):
obj = self._selected_obj
def groupby_series(obj, col=None):
- return SeriesGroupBy(obj,
- selection=col,
- grouper=self.grouper).nunique(dropna=dropna)
+ return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
+ dropna=dropna
+ )
if isinstance(obj, Series):
results = groupby_series(obj)
else:
from pandas.core.reshape.concat import concat
+
results = [groupby_series(obj[col], col) for col in obj.columns]
results = concat(results, axis=1)
results.columns.names = obj.columns.names
@@ -1669,10 +1697,7 @@ def _is_multi_agg_with_relabel(**kwargs):
>>> _is_multi_agg_with_relabel()
False
"""
- return all(
- isinstance(v, tuple) and len(v) == 2
- for v in kwargs.values()
- ) and kwargs
+ return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and kwargs
def _normalize_keyword_aggregation(kwargs):
@@ -1719,8 +1744,7 @@ def _normalize_keyword_aggregation(kwargs):
aggspec[column].append(aggfunc)
else:
aggspec[column] = [aggfunc]
- order.append((column,
- com.get_callable_name(aggfunc) or aggfunc))
+ order.append((column, com.get_callable_name(aggfunc) or aggfunc))
return aggspec, columns, order
@@ -1730,6 +1754,7 @@ def _normalize_keyword_aggregation(kwargs):
# typing.Sequence[Callable[..., ScalarResult]]
# -> typing.Sequence[Callable[..., ScalarResult]]:
+
def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
"""
Possibly mangle a list of aggfuncs.
@@ -1756,7 +1781,7 @@ def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
for aggfunc in aggfuncs:
        if com.get_callable_name(aggfunc) == "<lambda>":
            aggfunc = functools.partial(aggfunc)
-            aggfunc.__name__ = '<lambda_{}>'.format(i)
+            aggfunc.__name__ = "<lambda_{}>".format(i)
i += 1
mangled_aggfuncs.append(aggfunc)
@@ -1828,13 +1853,15 @@ def _recast_datetimelike_result(result: DataFrame) -> DataFrame:
"""
result = result.copy()
- obj_cols = [idx for idx in range(len(result.columns))
- if is_object_dtype(result.dtypes[idx])]
+ obj_cols = [
+ idx for idx in range(len(result.columns)) if is_object_dtype(result.dtypes[idx])
+ ]
# See GH#26285
for n in obj_cols:
- converted = maybe_convert_objects(result.iloc[:, n].values,
- convert_numeric=False)
+ converted = maybe_convert_objects(
+ result.iloc[:, n].values, convert_numeric=False
+ )
result.iloc[:, n] = converted
return result
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 925f006de92b6..aa71fd68086fb 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -28,16 +28,24 @@ class providing the base-class of operations.
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
- ensure_float, is_datetime64tz_dtype, is_extension_array_dtype,
- is_numeric_dtype, is_scalar)
+ ensure_float,
+ is_datetime64tz_dtype,
+ is_extension_array_dtype,
+ is_numeric_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.missing import isna, notna
-from pandas.api.types import (
- is_datetime64_dtype, is_integer_dtype, is_object_dtype)
+from pandas.api.types import is_datetime64_dtype, is_integer_dtype, is_object_dtype
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical
from pandas.core.base import (
- DataError, GroupByError, PandasObject, SelectionMixin, SpecificationError)
+ DataError,
+ GroupByError,
+ PandasObject,
+ SelectionMixin,
+ SpecificationError,
+)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
@@ -169,7 +177,8 @@ class providing the base-class of operations.
Examples
--------
{examples}
- """)
+ """,
+)
_pipe_template = """
Apply a function `func` with arguments to this %(klass)s object and return
@@ -303,14 +312,17 @@ def __init__(self, groupby):
def __call__(self, *args, **kwargs):
def f(self):
return self.plot(*args, **kwargs)
- f.__name__ = 'plot'
+
+ f.__name__ = "plot"
return self._groupby.apply(f)
def __getattr__(self, name):
def attr(*args, **kwargs):
def f(self):
return getattr(self.plot, name)(*args, **kwargs)
+
return self._groupby.apply(f)
+
return attr
@@ -328,10 +340,22 @@ class _GroupBy(PandasObject, SelectionMixin):
_group_selection = None
_apply_whitelist = frozenset() # type: FrozenSet[str]
- def __init__(self, obj, keys=None, axis=0, level=None,
- grouper=None, exclusions=None, selection=None, as_index=True,
- sort=True, group_keys=True, squeeze=False,
- observed=False, **kwargs):
+ def __init__(
+ self,
+ obj,
+ keys=None,
+ axis=0,
+ level=None,
+ grouper=None,
+ exclusions=None,
+ selection=None,
+ as_index=True,
+ sort=True,
+ group_keys=True,
+ squeeze=False,
+ observed=False,
+ **kwargs
+ ):
self._selection = selection
@@ -342,9 +366,9 @@ def __init__(self, obj, keys=None, axis=0, level=None,
if not as_index:
if not isinstance(obj, DataFrame):
- raise TypeError('as_index=False only valid with DataFrame')
+ raise TypeError("as_index=False only valid with DataFrame")
if axis != 0:
- raise ValueError('as_index=False only valid for axis=0')
+ raise ValueError("as_index=False only valid for axis=0")
self.as_index = as_index
self.keys = keys
@@ -352,16 +376,20 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.group_keys = group_keys
self.squeeze = squeeze
self.observed = observed
- self.mutated = kwargs.pop('mutated', False)
+ self.mutated = kwargs.pop("mutated", False)
if grouper is None:
from pandas.core.groupby.grouper import _get_grouper
- grouper, exclusions, obj = _get_grouper(obj, keys,
- axis=axis,
- level=level,
- sort=sort,
- observed=observed,
- mutated=self.mutated)
+
+ grouper, exclusions, obj = _get_grouper(
+ obj,
+ keys,
+ axis=axis,
+ level=level,
+ sort=sort,
+ observed=observed,
+ mutated=self.mutated,
+ )
self.obj = obj
self.axis = obj._get_axis_number(axis)
@@ -369,7 +397,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.exclusions = set(exclusions) if exclusions else set()
# we accept no other args
- validate_kwargs('group', kwargs, {})
+ validate_kwargs("group", kwargs, {})
def __len__(self):
return len(self.groups)
@@ -428,13 +456,12 @@ def get_converter(s):
if len(self.indices) > 0:
index_sample = next(iter(self.indices))
else:
- index_sample = None # Dummy sample
+ index_sample = None # Dummy sample
name_sample = names[0]
if isinstance(index_sample, tuple):
if not isinstance(name_sample, tuple):
- msg = ("must supply a tuple to get_group with multiple"
- " grouping keys")
+ msg = "must supply a tuple to get_group with multiple" " grouping keys"
raise ValueError(msg)
if not len(name_sample) == len(index_sample):
try:
@@ -442,13 +469,14 @@ def get_converter(s):
return [self.indices[name] for name in names]
except KeyError:
# turns out it wasn't a tuple
- msg = ("must supply a same-length tuple to get_group"
- " with multiple grouping keys")
+ msg = (
+ "must supply a same-length tuple to get_group"
+ " with multiple grouping keys"
+ )
raise ValueError(msg)
converters = [get_converter(s) for s in index_sample]
- names = (tuple(f(n) for f, n in zip(converters, name))
- for name in names)
+ names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
else:
converter = get_converter(index_sample)
@@ -482,7 +510,7 @@ def _reset_group_selection(self):
if self._group_selection is not None:
# GH12839 clear cached selection too when changing group selection
self._group_selection = None
- self._reset_cache('_selected_obj')
+ self._reset_cache("_selected_obj")
def _set_group_selection(self):
"""
@@ -493,21 +521,21 @@ def _set_group_selection(self):
NOTE: this should be paired with a call to _reset_group_selection
"""
grp = self.grouper
- if not (self.as_index and
- getattr(grp, 'groupings', None) is not None and
- self.obj.ndim > 1 and
- self._group_selection is None):
+ if not (
+ self.as_index
+ and getattr(grp, "groupings", None) is not None
+ and self.obj.ndim > 1
+ and self._group_selection is None
+ ):
return
ax = self.obj._info_axis
- groupers = [g.name for g in grp.groupings
- if g.level is None and g.in_axis]
+ groupers = [g.name for g in grp.groupings if g.level is None and g.in_axis]
if len(groupers):
# GH12839 clear selected obj cache when group selection changes
- self._group_selection = ax.difference(Index(groupers),
- sort=False).tolist()
- self._reset_cache('_selected_obj')
+ self._group_selection = ax.difference(Index(groupers), sort=False).tolist()
+ self._reset_cache("_selected_obj")
def _set_result_index_ordered(self, result):
# set the result index on the passed values object and
@@ -516,13 +544,11 @@ def _set_result_index_ordered(self, result):
# the values/counts are repeated according to the group index
# shortcut if we have an already ordered grouper
if not self.grouper.is_monotonic:
- index = Index(np.concatenate(
- self._get_indices(self.grouper.result_index)))
+ index = Index(np.concatenate(self._get_indices(self.grouper.result_index)))
result.set_axis(index, axis=self.axis, inplace=True)
result = result.sort_index(axis=self.axis)
- result.set_axis(self.obj._get_axis(self.axis), axis=self.axis,
- inplace=True)
+ result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
return result
def _dir_additions(self):
@@ -536,12 +562,14 @@ def __getattr__(self, attr):
if hasattr(self.obj, attr):
return self._make_wrapper(attr)
- raise AttributeError("%r object has no attribute %r" %
- (type(self).__name__, attr))
+ raise AttributeError(
+ "%r object has no attribute %r" % (type(self).__name__, attr)
+ )
- @Substitution(klass='GroupBy',
- versionadded='.. versionadded:: 0.21.0',
- examples="""\
+ @Substitution(
+ klass="GroupBy",
+ versionadded=".. versionadded:: 0.21.0",
+ examples="""\
>>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
>>> df
A B
@@ -557,7 +585,8 @@ def __getattr__(self, attr):
B
A
a 2
-b 2""")
+b 2""",
+ )
@Appender(_pipe_template)
def pipe(self, func, *args, **kwargs):
return com._pipe(self, func, *args, **kwargs)
@@ -567,10 +596,11 @@ def pipe(self, func, *args, **kwargs):
def _make_wrapper(self, name):
if name not in self._apply_whitelist:
is_callable = callable(getattr(self._selected_obj, name, None))
- kind = ' callable ' if is_callable else ' '
- msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try "
- "using the 'apply' method".format(kind, name,
- type(self).__name__))
+ kind = " callable " if is_callable else " "
+ msg = (
+ "Cannot access{0}attribute {1!r} of {2!r} objects, try "
+ "using the 'apply' method".format(kind, name, type(self).__name__)
+ )
raise AttributeError(msg)
self._set_group_selection()
@@ -587,9 +617,8 @@ def wrapper(*args, **kwargs):
# a little trickery for aggregation functions that need an axis
# argument
kwargs_with_axis = kwargs.copy()
- if ('axis' not in kwargs_with_axis or
- kwargs_with_axis['axis'] is None):
- kwargs_with_axis['axis'] = self.axis
+ if "axis" not in kwargs_with_axis or kwargs_with_axis["axis"] is None:
+ kwargs_with_axis["axis"] = self.axis
def curried_with_axis(x):
return f(x, *args, **kwargs_with_axis)
@@ -620,8 +649,7 @@ def curried(x):
# if we don't have this method to indicated to aggregate to
# mark this column as an error
try:
- return self._aggregate_item_by_item(name,
- *args, **kwargs)
+ return self._aggregate_item_by_item(name, *args, **kwargs)
except (AttributeError):
raise ValueError
@@ -664,9 +692,11 @@ def __iter__(self):
"""
return self.grouper.get_iterator(self.obj, axis=self.axis)
- @Appender(_apply_docs['template']
- .format(input="dataframe",
- examples=_apply_docs['dataframe_examples']))
+ @Appender(
+ _apply_docs["template"].format(
+ input="dataframe", examples=_apply_docs["dataframe_examples"]
+ )
+ )
def apply(self, func, *args, **kwargs):
func = self._is_builtin_func(func)
@@ -679,16 +709,18 @@ def apply(self, func, *args, **kwargs):
@wraps(func)
def f(g):
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
return func(g, *args, **kwargs)
+
else:
- raise ValueError('func must be a callable if args or '
- 'kwargs are supplied')
+ raise ValueError(
+ "func must be a callable if args or " "kwargs are supplied"
+ )
else:
f = func
# ignore SettingWithCopy here in case the user mutates
- with option_context('mode.chained_assignment', None):
+ with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f)
except Exception:
@@ -707,13 +739,11 @@ def f(g):
return result
def _python_apply_general(self, f):
- keys, values, mutated = self.grouper.apply(f, self._selected_obj,
- self.axis)
+ keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
return self._wrap_applied_output(
- keys,
- values,
- not_indexed_same=mutated or self.mutated)
+ keys, values, not_indexed_same=mutated or self.mutated
+ )
def _iterate_slices(self):
yield self._selection_name, self._selected_obj
@@ -775,7 +805,7 @@ def _try_cast(self, result, obj, numeric_only=False):
# to the target timezone
try:
result = obj._values._from_sequence(
- result, dtype='datetime64[ns, UTC]'
+ result, dtype="datetime64[ns, UTC]"
)
result = result.astype(dtype)
except TypeError:
@@ -813,7 +843,8 @@ def _transform_should_cast(self, func_nm):
Whether transform should attempt to cast the result of aggregation
"""
return (self.size().fillna(0) > 0).any() and (
- func_nm not in base.cython_cast_blacklist)
+ func_nm not in base.cython_cast_blacklist
+ )
def _cython_transform(self, how, numeric_only=True, **kwargs):
output = collections.OrderedDict()
@@ -823,8 +854,7 @@ def _cython_transform(self, how, numeric_only=True, **kwargs):
continue
try:
- result, names = self.grouper.transform(obj.values, how,
- **kwargs)
+ result, names = self.grouper.transform(obj.values, how, **kwargs)
except NotImplementedError:
continue
except AssertionError as e:
@@ -835,12 +865,11 @@ def _cython_transform(self, how, numeric_only=True, **kwargs):
output[name] = result
if len(output) == 0:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
return self._wrap_transformed_output(output, names)
- def _cython_agg_general(self, how, alt=None, numeric_only=True,
- min_count=-1):
+ def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1):
output = {}
for name, obj in self._iterate_slices():
is_numeric = is_numeric_dtype(obj.dtype)
@@ -848,14 +877,15 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True,
continue
try:
- result, names = self.grouper.aggregate(obj.values, how,
- min_count=min_count)
+ result, names = self.grouper.aggregate(
+ obj.values, how, min_count=min_count
+ )
except AssertionError as e:
raise GroupByError(str(e))
output[name] = self._try_cast(result, obj)
if len(output) == 0:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
return self._wrap_aggregated_output(output, names)
@@ -918,7 +948,8 @@ def reset_identity(values):
# GH 14776
if isinstance(ax, MultiIndex) and not ax.is_unique:
indexer = algorithms.unique1d(
- result.index.get_indexer_for(ax.values))
+ result.index.get_indexer_for(ax.values)
+ )
result = result.take(indexer, axis=self.axis)
else:
result = result.reindex(ax, axis=self.axis)
@@ -933,9 +964,14 @@ def reset_identity(values):
group_levels = self.grouper.levels
group_names = self.grouper.names
- result = concat(values, axis=self.axis, keys=group_keys,
- levels=group_levels, names=group_names,
- sort=False)
+ result = concat(
+ values,
+ axis=self.axis,
+ keys=group_keys,
+ levels=group_levels,
+ names=group_names,
+ sort=False,
+ )
else:
# GH5610, returns a MI, with the first level being a
@@ -946,8 +982,10 @@ def reset_identity(values):
values = reset_identity(values)
result = concat(values, axis=self.axis)
- if (isinstance(result, Series) and
- getattr(self, '_selection_name', None) is not None):
+ if (
+ isinstance(result, Series)
+ and getattr(self, "_selection_name", None) is not None
+ ):
result.name = self._selection_name
@@ -955,7 +993,7 @@ def reset_identity(values):
def _apply_filter(self, indices, dropna):
if len(indices) == 0:
- indices = np.array([], dtype='int64')
+ indices = np.array([], dtype="int64")
else:
indices = np.sort(np.concatenate(indices))
if dropna:
@@ -1038,6 +1076,7 @@ class GroupBy(_GroupBy):
See the online documentation for full exposition on these topics and much
more
"""
+
def _bool_agg(self, val_test, skipna):
"""
Shared func to call any / all Cython GroupBy implementations.
@@ -1054,16 +1093,20 @@ def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]:
def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
return result.astype(inference, copy=False)
- return self._get_cythonized_result('group_any_all', self.grouper,
- aggregate=True,
- cython_dtype=np.uint8,
- needs_values=True,
- needs_mask=True,
- pre_processing=objs_to_bool,
- post_processing=result_to_bool,
- val_test=val_test, skipna=skipna)
-
- @Substitution(name='groupby')
+ return self._get_cythonized_result(
+ "group_any_all",
+ self.grouper,
+ aggregate=True,
+ cython_dtype=np.uint8,
+ needs_values=True,
+ needs_mask=True,
+ pre_processing=objs_to_bool,
+ post_processing=result_to_bool,
+ val_test=val_test,
+ skipna=skipna,
+ )
+
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def any(self, skipna=True):
"""
@@ -1078,9 +1121,9 @@ def any(self, skipna=True):
-------
bool
"""
- return self._bool_agg('any', skipna)
+ return self._bool_agg("any", skipna)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def all(self, skipna=True):
"""
@@ -1095,9 +1138,9 @@ def all(self, skipna=True):
-------
bool
"""
- return self._bool_agg('all', skipna)
+ return self._bool_agg("all", skipna)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def count(self):
"""
@@ -1112,7 +1155,7 @@ def count(self):
# defined here for API doc
raise NotImplementedError
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def mean(self, *args, **kwargs):
"""
@@ -1156,10 +1199,11 @@ def mean(self, *args, **kwargs):
2 4.0
Name: B, dtype: float64
"""
- nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])
+ nv.validate_groupby_func("mean", args, kwargs, ["numeric_only"])
try:
return self._cython_agg_general(
- 'mean', alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs)
+ "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs
+ )
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1167,7 +1211,7 @@ def mean(self, *args, **kwargs):
f = lambda x: x.mean(axis=self.axis, **kwargs)
return self._python_agg_general(f)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def median(self, **kwargs):
"""
@@ -1182,10 +1226,10 @@ def median(self, **kwargs):
"""
try:
return self._cython_agg_general(
- 'median',
- alt=lambda x,
- axis: Series(x).median(axis=axis, **kwargs),
- **kwargs)
+ "median",
+ alt=lambda x, axis: Series(x).median(axis=axis, **kwargs),
+ **kwargs
+ )
except GroupByError:
raise
except Exception: # pragma: no cover
@@ -1194,10 +1238,11 @@ def f(x):
if isinstance(x, np.ndarray):
x = Series(x)
return x.median(axis=self.axis, **kwargs)
+
with _group_selection_context(self):
return self._python_agg_general(f)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def std(self, ddof=1, *args, **kwargs):
"""
@@ -1217,10 +1262,10 @@ def std(self, ddof=1, *args, **kwargs):
"""
# TODO: implement at Cython level?
- nv.validate_groupby_func('std', args, kwargs)
+ nv.validate_groupby_func("std", args, kwargs)
return np.sqrt(self.var(ddof=ddof, **kwargs))
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def var(self, ddof=1, *args, **kwargs):
"""
@@ -1238,13 +1283,14 @@ def var(self, ddof=1, *args, **kwargs):
Series or DataFrame
Variance of values within each group.
"""
- nv.validate_groupby_func('var', args, kwargs)
+ nv.validate_groupby_func("var", args, kwargs)
if ddof == 1:
try:
return self._cython_agg_general(
- 'var',
+ "var",
alt=lambda x, axis: Series(x).var(ddof=ddof, **kwargs),
- **kwargs)
+ **kwargs
+ )
except Exception:
f = lambda x: x.var(ddof=ddof, **kwargs)
with _group_selection_context(self):
@@ -1254,7 +1300,7 @@ def var(self, ddof=1, *args, **kwargs):
with _group_selection_context(self):
return self._python_agg_general(f)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def sem(self, ddof=1):
"""
@@ -1274,7 +1320,7 @@ def sem(self, ddof=1):
"""
return self.std(ddof=ddof) / np.sqrt(self.count())
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def size(self):
"""
@@ -1288,7 +1334,7 @@ def size(self):
result = self.grouper.size()
if isinstance(self.obj, Series):
- result.name = getattr(self.obj, 'name', None)
+ result.name = getattr(self.obj, "name", None)
return result
@classmethod
@@ -1297,9 +1343,7 @@ def _add_numeric_operations(cls):
Add numeric operations to the GroupBy generically.
"""
- def groupby_function(name, alias, npfunc,
- numeric_only=True,
- min_count=-1):
+ def groupby_function(name, alias, npfunc, numeric_only=True, min_count=-1):
_local_template = """
Compute %(f)s of group values.
@@ -1310,38 +1354,34 @@ def groupby_function(name, alias, npfunc,
Computed %(f)s of values within each group.
"""
- @Substitution(name='groupby', f=name)
+ @Substitution(name="groupby", f=name)
@Appender(_common_see_also)
@Appender(_local_template)
def f(self, **kwargs):
- if 'numeric_only' not in kwargs:
- kwargs['numeric_only'] = numeric_only
- if 'min_count' not in kwargs:
- kwargs['min_count'] = min_count
+ if "numeric_only" not in kwargs:
+ kwargs["numeric_only"] = numeric_only
+ if "min_count" not in kwargs:
+ kwargs["min_count"] = min_count
self._set_group_selection()
# try a cython aggregation if we can
try:
- return self._cython_agg_general(
- alias, alt=npfunc, **kwargs)
+ return self._cython_agg_general(alias, alt=npfunc, **kwargs)
except AssertionError as e:
raise SpecificationError(str(e))
except Exception:
pass
# apply a non-cython aggregation
- result = self.aggregate(
- lambda x: npfunc(x, axis=self.axis))
+ result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
# coerce the resulting columns if we can
if isinstance(result, DataFrame):
for col in result.columns:
- result[col] = self._try_cast(
- result[col], self.obj[col])
+ result[col] = self._try_cast(result[col], self.obj[col])
else:
- result = self._try_cast(
- result, self.obj)
+ result = self._try_cast(result, self.obj)
return result
@@ -1350,7 +1390,6 @@ def f(self, **kwargs):
return f
def first_compat(x, axis=0):
-
def first(x):
x = x.to_numpy()
@@ -1365,7 +1404,6 @@ def first(x):
return first(x)
def last_compat(x, axis=0):
-
def last(x):
x = x.to_numpy()
x = x[notna(x)]
@@ -1378,16 +1416,14 @@ def last(x):
else:
return last(x)
- cls.sum = groupby_function('sum', 'add', np.sum, min_count=0)
- cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0)
- cls.min = groupby_function('min', 'min', np.min, numeric_only=False)
- cls.max = groupby_function('max', 'max', np.max, numeric_only=False)
- cls.first = groupby_function('first', 'first', first_compat,
- numeric_only=False)
- cls.last = groupby_function('last', 'last', last_compat,
- numeric_only=False)
+ cls.sum = groupby_function("sum", "add", np.sum, min_count=0)
+ cls.prod = groupby_function("prod", "prod", np.prod, min_count=0)
+ cls.min = groupby_function("min", "min", np.min, numeric_only=False)
+ cls.max = groupby_function("max", "max", np.max, numeric_only=False)
+ cls.first = groupby_function("first", "first", first_compat, numeric_only=False)
+ cls.last = groupby_function("last", "last", last_compat, numeric_only=False)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def ohlc(self):
"""
@@ -1401,8 +1437,7 @@ def ohlc(self):
Open, high, low and close values within each group.
"""
- return self._apply_to_column_groupbys(
- lambda x: x._cython_agg_general('ohlc'))
+ return self._apply_to_column_groupbys(lambda x: x._cython_agg_general("ohlc"))
@Appender(DataFrame.describe.__doc__)
def describe(self, **kwargs):
@@ -1519,18 +1554,20 @@ def resample(self, rule, *args, **kwargs):
5 2000-01-01 00:00:20 5 1
"""
from pandas.core.resample import get_resampler_for_grouping
+
return get_resampler_for_grouping(self, rule, *args, **kwargs)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def rolling(self, *args, **kwargs):
"""
Return a rolling grouper, providing rolling functionality per group.
"""
from pandas.core.window import RollingGroupby
+
return RollingGroupby(self, *args, **kwargs)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def expanding(self, *args, **kwargs):
"""
@@ -1538,6 +1575,7 @@ def expanding(self, *args, **kwargs):
functionality per group.
"""
from pandas.core.window import ExpandingGroupby
+
return ExpandingGroupby(self, *args, **kwargs)
def _fill(self, direction, limit=None):
@@ -1567,13 +1605,17 @@ def _fill(self, direction, limit=None):
if limit is None:
limit = -1
- return self._get_cythonized_result('group_fillna_indexer',
- self.grouper, needs_mask=True,
- cython_dtype=np.int64,
- result_is_index=True,
- direction=direction, limit=limit)
-
- @Substitution(name='groupby')
+ return self._get_cythonized_result(
+ "group_fillna_indexer",
+ self.grouper,
+ needs_mask=True,
+ cython_dtype=np.int64,
+ result_is_index=True,
+ direction=direction,
+ limit=limit,
+ )
+
+ @Substitution(name="groupby")
def pad(self, limit=None):
"""
Forward fill the values.
@@ -1595,10 +1637,11 @@ def pad(self, limit=None):
Series.fillna
DataFrame.fillna
"""
- return self._fill('ffill', limit=limit)
+ return self._fill("ffill", limit=limit)
+
ffill = pad
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
def backfill(self, limit=None):
"""
Backward fill the values.
@@ -1620,14 +1663,13 @@ def backfill(self, limit=None):
Series.fillna
DataFrame.fillna
"""
- return self._fill('bfill', limit=limit)
+ return self._fill("bfill", limit=limit)
+
bfill = backfill
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
- def nth(self,
- n: Union[int, List[int]],
- dropna: Optional[str] = None) -> DataFrame:
+ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFrame:
"""
Take the nth row from each group if n is an int, or a subset of rows
if n is a list of ints.
@@ -1717,8 +1759,7 @@ def nth(self,
self._set_group_selection()
mask_left = np.in1d(self._cumcount_array(), nth_array)
- mask_right = np.in1d(self._cumcount_array(ascending=False) + 1,
- -nth_array)
+ mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array)
mask = mask_left | mask_right
ids, _, _ = self.grouper.group_info
@@ -1736,19 +1777,19 @@ def nth(self,
# dropna is truthy
if isinstance(n, valid_containers):
- raise ValueError(
- "dropna option with a list of nth values is not supported")
+ raise ValueError("dropna option with a list of nth values is not supported")
- if dropna not in ['any', 'all']:
+ if dropna not in ["any", "all"]:
# Note: when agg-ing picker doesn't raise this, just returns NaN
- raise ValueError("For a DataFrame groupby, dropna must be "
- "either None, 'any' or 'all', "
- "(was passed {dropna}).".format(
- dropna=dropna))
+ raise ValueError(
+ "For a DataFrame groupby, dropna must be "
+ "either None, 'any' or 'all', "
+ "(was passed {dropna}).".format(dropna=dropna)
+ )
# old behaviour, but with all and any support for DataFrames.
# modified in GH 7559 to have better perf
- max_len = n if n >= 0 else - 1 - n
+ max_len = n if n >= 0 else -1 - n
dropped = self.obj.dropna(how=dropna, axis=self.axis)
# get a new grouper for our dropped obj
@@ -1765,13 +1806,17 @@ def nth(self,
# create a grouper with the original parameters, but on dropped
# object
from pandas.core.groupby.grouper import _get_grouper
- grouper, _, _ = _get_grouper(dropped, key=self.keys,
- axis=self.axis, level=self.level,
- sort=self.sort,
- mutated=self.mutated)
- grb = dropped.groupby(
- grouper, as_index=self.as_index, sort=self.sort)
+ grouper, _, _ = _get_grouper(
+ dropped,
+ key=self.keys,
+ axis=self.axis,
+ level=self.level,
+ sort=self.sort,
+ mutated=self.mutated,
+ )
+
+ grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
sizes, result = grb.size(), grb.nth(n)
mask = (sizes < max_len).values
@@ -1780,15 +1825,16 @@ def nth(self,
result.loc[mask] = np.nan
# reset/reindex to the original groups
- if (len(self.obj) == len(dropped) or
- len(result) == len(self.grouper.result_index)):
+ if len(self.obj) == len(dropped) or len(result) == len(
+ self.grouper.result_index
+ ):
result.index = self.grouper.result_index
else:
result = result.reindex(self.grouper.result_index)
return result
- def quantile(self, q=0.5, interpolation='linear'):
+ def quantile(self, q=0.5, interpolation="linear"):
"""
Return group values at the given quantile, a la numpy.percentile.
@@ -1823,44 +1869,46 @@ def quantile(self, q=0.5, interpolation='linear'):
b 3.0
"""
- def pre_processor(
- vals: np.ndarray
- ) -> Tuple[np.ndarray, Optional[Type]]:
+ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
if is_object_dtype(vals):
- raise TypeError("'quantile' cannot be performed against "
- "'object' dtypes!")
+ raise TypeError(
+ "'quantile' cannot be performed against " "'object' dtypes!"
+ )
inference = None
if is_integer_dtype(vals):
inference = np.int64
elif is_datetime64_dtype(vals):
- inference = 'datetime64[ns]'
+ inference = "datetime64[ns]"
vals = vals.astype(np.float)
return vals, inference
- def post_processor(
- vals: np.ndarray,
- inference: Optional[Type]
- ) -> np.ndarray:
+ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
if inference:
# Check for edge case
- if not (is_integer_dtype(inference) and
- interpolation in {'linear', 'midpoint'}):
+ if not (
+ is_integer_dtype(inference)
+ and interpolation in {"linear", "midpoint"}
+ ):
vals = vals.astype(inference)
return vals
- return self._get_cythonized_result('group_quantile', self.grouper,
- aggregate=True,
- needs_values=True,
- needs_mask=True,
- cython_dtype=np.float64,
- pre_processing=pre_processor,
- post_processing=post_processor,
- q=q, interpolation=interpolation)
-
- @Substitution(name='groupby')
+ return self._get_cythonized_result(
+ "group_quantile",
+ self.grouper,
+ aggregate=True,
+ needs_values=True,
+ needs_mask=True,
+ cython_dtype=np.float64,
+ pre_processing=pre_processor,
+ post_processing=post_processor,
+ q=q,
+ interpolation=interpolation,
+ )
+
+ @Substitution(name="groupby")
def ngroup(self, ascending=True):
"""
Number each group from 0 to the number of groups - 1.
@@ -1931,7 +1979,7 @@ def ngroup(self, ascending=True):
result = self.ngroups - 1 - result
return result
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
def cumcount(self, ascending=True):
"""
Number each item in each group from 0 to the length of that group - 1.
@@ -1990,10 +2038,11 @@ def cumcount(self, ascending=True):
cumcounts = self._cumcount_array(ascending=ascending)
return Series(cumcounts, index)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
- def rank(self, method='average', ascending=True, na_option='keep',
- pct=False, axis=0):
+ def rank(
+ self, method="average", ascending=True, na_option="keep", pct=False, axis=0
+ ):
"""
Provide the rank of values within each group.
@@ -2020,14 +2069,20 @@ def rank(self, method='average', ascending=True, na_option='keep',
-------
DataFrame with ranking of values within each group
"""
- if na_option not in {'keep', 'top', 'bottom'}:
+ if na_option not in {"keep", "top", "bottom"}:
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
raise ValueError(msg)
- return self._cython_transform('rank', numeric_only=False,
- ties_method=method, ascending=ascending,
- na_option=na_option, pct=pct, axis=axis)
-
- @Substitution(name='groupby')
+ return self._cython_transform(
+ "rank",
+ numeric_only=False,
+ ties_method=method,
+ ascending=ascending,
+ na_option=na_option,
+ pct=pct,
+ axis=axis,
+ )
+
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def cumprod(self, axis=0, *args, **kwargs):
"""
@@ -2037,14 +2092,13 @@ def cumprod(self, axis=0, *args, **kwargs):
-------
Series or DataFrame
"""
- nv.validate_groupby_func('cumprod', args, kwargs,
- ['numeric_only', 'skipna'])
+ nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
if axis != 0:
return self.apply(lambda x: x.cumprod(axis=axis, **kwargs))
- return self._cython_transform('cumprod', **kwargs)
+ return self._cython_transform("cumprod", **kwargs)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def cumsum(self, axis=0, *args, **kwargs):
"""
@@ -2054,14 +2108,13 @@ def cumsum(self, axis=0, *args, **kwargs):
-------
Series or DataFrame
"""
- nv.validate_groupby_func('cumsum', args, kwargs,
- ['numeric_only', 'skipna'])
+ nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
if axis != 0:
return self.apply(lambda x: x.cumsum(axis=axis, **kwargs))
- return self._cython_transform('cumsum', **kwargs)
+ return self._cython_transform("cumsum", **kwargs)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def cummin(self, axis=0, **kwargs):
"""
@@ -2074,9 +2127,9 @@ def cummin(self, axis=0, **kwargs):
if axis != 0:
return self.apply(lambda x: np.minimum.accumulate(x, axis))
- return self._cython_transform('cummin', numeric_only=False)
+ return self._cython_transform("cummin", numeric_only=False)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def cummax(self, axis=0, **kwargs):
"""
@@ -2089,14 +2142,22 @@ def cummax(self, axis=0, **kwargs):
if axis != 0:
return self.apply(lambda x: np.maximum.accumulate(x, axis))
- return self._cython_transform('cummax', numeric_only=False)
-
- def _get_cythonized_result(self, how, grouper, aggregate=False,
- cython_dtype=None, needs_values=False,
- needs_mask=False, needs_ngroups=False,
- result_is_index=False,
- pre_processing=None, post_processing=None,
- **kwargs):
+ return self._cython_transform("cummax", numeric_only=False)
+
+ def _get_cythonized_result(
+ self,
+ how,
+ grouper,
+ aggregate=False,
+ cython_dtype=None,
+ needs_values=False,
+ needs_mask=False,
+ needs_ngroups=False,
+ result_is_index=False,
+ pre_processing=None,
+ post_processing=None,
+ **kwargs
+ ):
"""
Get result for Cythonized functions.
@@ -2140,8 +2201,9 @@ def _get_cythonized_result(self, how, grouper, aggregate=False,
`Series` or `DataFrame` with filled values
"""
if result_is_index and aggregate:
- raise ValueError("'result_is_index' and 'aggregate' cannot both "
- "be True!")
+ raise ValueError(
+ "'result_is_index' and 'aggregate' cannot both " "be True!"
+ )
if post_processing:
if not callable(pre_processing):
raise ValueError("'post_processing' must be a callable!")
@@ -2149,8 +2211,9 @@ def _get_cythonized_result(self, how, grouper, aggregate=False,
if not callable(pre_processing):
raise ValueError("'pre_processing' must be a callable!")
if not needs_values:
- raise ValueError("Cannot use 'pre_processing' without "
- "specifying 'needs_values'!")
+ raise ValueError(
+ "Cannot use 'pre_processing' without " "specifying 'needs_values'!"
+ )
labels, _, ngroups = grouper.group_info
output = collections.OrderedDict()
@@ -2197,7 +2260,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False,
else:
return self._wrap_transformed_output(output)
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Appender(_common_see_also)
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
"""
@@ -2220,19 +2283,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
"""
if freq is not None or axis != 0 or not isna(fill_value):
- return self.apply(lambda x: x.shift(periods, freq,
- axis, fill_value))
-
- return self._get_cythonized_result('group_shift_indexer',
- self.grouper, cython_dtype=np.int64,
- needs_ngroups=True,
- result_is_index=True,
- periods=periods)
-
- @Substitution(name='groupby')
+ return self.apply(lambda x: x.shift(periods, freq, axis, fill_value))
+
+ return self._get_cythonized_result(
+ "group_shift_indexer",
+ self.grouper,
+ cython_dtype=np.int64,
+ needs_ngroups=True,
+ result_is_index=True,
+ periods=periods,
+ )
+
+ @Substitution(name="groupby")
@Appender(_common_see_also)
- def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
- axis=0):
+ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0):
"""
Calculate pct_change of each value to previous entry in group.
@@ -2242,16 +2306,21 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
Percentage changes within each group.
"""
if freq is not None or axis != 0:
- return self.apply(lambda x: x.pct_change(periods=periods,
- fill_method=fill_method,
- limit=limit, freq=freq,
- axis=axis))
+ return self.apply(
+ lambda x: x.pct_change(
+ periods=periods,
+ fill_method=fill_method,
+ limit=limit,
+ freq=freq,
+ axis=axis,
+ )
+ )
filled = getattr(self, fill_method)(limit=limit)
fill_grp = filled.groupby(self.grouper.labels)
shifted = fill_grp.shift(periods=periods, freq=freq)
return (filled / shifted) - 1
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def head(self, n=5):
"""
@@ -2282,7 +2351,7 @@ def head(self, n=5):
mask = self._cumcount_array() < n
return self._selected_obj[mask]
- @Substitution(name='groupby')
+ @Substitution(name="groupby")
@Substitution(see_also=_common_see_also)
def tail(self, n=5):
"""
@@ -2347,16 +2416,19 @@ def _reindex_output(self, output):
return output
# reindexing only applies to a Categorical grouper
- elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))
- for ping in groupings):
+ elif not any(
+ isinstance(ping.grouper, (Categorical, CategoricalIndex))
+ for ping in groupings
+ ):
return output
levels_list = [ping.group_index for ping in groupings]
index, _ = MultiIndex.from_product(
- levels_list, names=self.grouper.names).sortlevel()
+ levels_list, names=self.grouper.names
+ ).sortlevel()
if self.as_index:
- d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
+ d = {self.obj._get_axis_name(self.axis): index, "copy": False}
return output.reindex(**d)
# GH 13204
@@ -2370,15 +2442,15 @@ def _reindex_output(self, output):
# reindex `output`, and then reset the in-axis grouper columns.
# Select in-axis groupers
- in_axis_grps = ((i, ping.name) for (i, ping)
- in enumerate(groupings) if ping.in_axis)
+ in_axis_grps = (
+ (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
+ )
g_nums, g_names = zip(*in_axis_grps)
output = output.drop(labels=list(g_names), axis=1)
# Set a temp index and reindex (possibly expanding)
- output = output.set_index(self.grouper.result_index
- ).reindex(index, copy=False)
+ output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
@@ -2394,11 +2466,13 @@ def _reindex_output(self, output):
def groupby(obj, by, **kwds):
if isinstance(obj, Series):
from pandas.core.groupby.generic import SeriesGroupBy
+
klass = SeriesGroupBy
elif isinstance(obj, DataFrame):
from pandas.core.groupby.generic import DataFrameGroupBy
+
klass = DataFrameGroupBy
else: # pragma: no cover
- raise TypeError('invalid type: {}'.format(obj))
+ raise TypeError("invalid type: {}".format(obj))
return klass(obj, by, **kwds)
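The groupby.py hunks above are all instances of the same few mechanical rewrites black applies: single-quoted string literals become double-quoted, hand-wrapped calls are exploded to one argument per line with a trailing comma, and multi-clause conditions are wrapped in parentheses. A minimal before/after sketch of the dominant pattern (the "group_example" argument is invented for illustration; only the call style is the point):

    # before black: hand-wrapped call, single quotes
    result = self._get_cythonized_result('group_example', self.grouper,
                                          aggregate=True,
                                          needs_values=True)

    # after black: double quotes, one keyword per line, trailing comma
    result = self._get_cythonized_result(
        "group_example",
        self.grouper,
        aggregate=True,
        needs_values=True,
    )
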
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
index 9e1033be26df2..818d844ca7994 100644
--- a/pandas/core/groupby/grouper.py
+++ b/pandas/core/groupby/grouper.py
@@ -11,8 +11,14 @@
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
- ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable,
- is_list_like, is_scalar, is_timedelta64_dtype)
+ ensure_categorical,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_hashable,
+ is_list_like,
+ is_scalar,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.generic import ABCSeries
import pandas.core.algorithms as algorithms
@@ -85,12 +91,13 @@ class Grouper:
>>> df.groupby(Grouper(level='date', freq='60s', axis=1))
"""
- _attributes = ('key', 'level', 'freq', 'axis',
- 'sort') # type: Tuple[str, ...]
+
+ _attributes = ("key", "level", "freq", "axis", "sort") # type: Tuple[str, ...]
def __new__(cls, *args, **kwargs):
- if kwargs.get('freq') is not None:
+ if kwargs.get("freq") is not None:
from pandas.core.resample import TimeGrouper
+
cls = TimeGrouper
return super().__new__(cls)
@@ -125,11 +132,14 @@ def _get_grouper(self, obj, validate=True):
"""
self._set_grouper(obj)
- self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key],
- axis=self.axis,
- level=self.level,
- sort=self.sort,
- validate=validate)
+ self.grouper, exclusions, self.obj = _get_grouper(
+ self.obj,
+ [self.key],
+ axis=self.axis,
+ level=self.level,
+ sort=self.sort,
+ validate=validate,
+ )
return self.binner, self.grouper, self.obj
def _set_grouper(self, obj, sort=False):
@@ -145,8 +155,7 @@ def _set_grouper(self, obj, sort=False):
"""
if self.key is not None and self.level is not None:
- raise ValueError(
- "The Grouper cannot specify both a key and a level!")
+ raise ValueError("The Grouper cannot specify both a key and a level!")
# Keep self.grouper value before overriding
if self._grouper is None:
@@ -156,13 +165,13 @@ def _set_grouper(self, obj, sort=False):
if self.key is not None:
key = self.key
# The 'on' is already defined
- if (getattr(self.grouper, 'name', None) == key and
- isinstance(obj, ABCSeries)):
+ if getattr(self.grouper, "name", None) == key and isinstance(
+ obj, ABCSeries
+ ):
ax = self._grouper.take(obj.index)
else:
if key not in obj._info_axis:
- raise KeyError(
- "The grouper name {0} is not found".format(key))
+ raise KeyError("The grouper name {0} is not found".format(key))
ax = Index(obj[key], name=key)
else:
@@ -174,18 +183,16 @@ def _set_grouper(self, obj, sort=False):
# equivalent to the axis name
if isinstance(ax, MultiIndex):
level = ax._get_level_number(level)
- ax = Index(ax._get_level_values(level),
- name=ax.names[level])
+ ax = Index(ax._get_level_values(level), name=ax.names[level])
else:
if level not in (0, ax.name):
- raise ValueError(
- "The level {0} is not valid".format(level))
+ raise ValueError("The level {0} is not valid".format(level))
# possibly sort
if (self.sort or sort) and not ax.is_monotonic:
# use stable sort to support first, last, nth
- indexer = self.indexer = ax.argsort(kind='mergesort')
+ indexer = self.indexer = ax.argsort(kind="mergesort")
ax = ax.take(indexer)
obj = obj._take(indexer, axis=self.axis, is_copy=False)
@@ -198,9 +205,11 @@ def groups(self):
return self.grouper.groups
def __repr__(self):
- attrs_list = ("{}={!r}".format(attr_name, getattr(self, attr_name))
- for attr_name in self._attributes
- if getattr(self, attr_name) is not None)
+ attrs_list = (
+ "{}={!r}".format(attr_name, getattr(self, attr_name))
+ for attr_name in self._attributes
+ if getattr(self, attr_name) is not None
+ )
attrs = ", ".join(attrs_list)
cls_name = self.__class__.__name__
return "{}({})".format(cls_name, attrs)
@@ -234,8 +243,17 @@ class Grouping:
* groups : dict of {group -> label_list}
"""
- def __init__(self, index, grouper=None, obj=None, name=None, level=None,
- sort=True, observed=False, in_axis=False):
+ def __init__(
+ self,
+ index,
+ grouper=None,
+ obj=None,
+ name=None,
+ level=None,
+ sort=True,
+ observed=False,
+ in_axis=False,
+ ):
self.name = name
self.level = level
@@ -260,14 +278,15 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
if level is not None:
if not isinstance(level, int):
if level not in index.names:
- raise AssertionError('Level {} not in index'.format(level))
+ raise AssertionError("Level {} not in index".format(level))
level = index.names.index(level)
if self.name is None:
self.name = index.names[level]
- self.grouper, self._labels, self._group_index = \
- index._get_grouper_for_level(self.grouper, level)
+ self.grouper, self._labels, self._group_index = index._get_grouper_for_level(
+ self.grouper, level
+ )
# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get labels
@@ -293,8 +312,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
elif is_categorical_dtype(self.grouper):
from pandas.core.groupby.categorical import recode_for_groupby
+
self.grouper, self.all_grouper = recode_for_groupby(
- self.grouper, self.sort, observed)
+ self.grouper, self.sort, observed
+ )
categories = self.grouper.categories
# we make a CategoricalIndex out of the cat grouper
@@ -310,42 +331,47 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self._group_index = CategoricalIndex(
Categorical.from_codes(
- codes=codes,
- categories=categories,
- ordered=self.grouper.ordered))
+ codes=codes, categories=categories, ordered=self.grouper.ordered
+ )
+ )
# we are done
if isinstance(self.grouper, Grouping):
self.grouper = self.grouper.grouper
# no level passed
- elif not isinstance(self.grouper,
- (Series, Index, ExtensionArray, np.ndarray)):
- if getattr(self.grouper, 'ndim', 1) != 1:
+ elif not isinstance(
+ self.grouper, (Series, Index, ExtensionArray, np.ndarray)
+ ):
+ if getattr(self.grouper, "ndim", 1) != 1:
t = self.name or str(type(self.grouper))
- raise ValueError(
- "Grouper for '{}' not 1-dimensional".format(t))
+ raise ValueError("Grouper for '{}' not 1-dimensional".format(t))
self.grouper = self.index.map(self.grouper)
- if not (hasattr(self.grouper, "__len__") and
- len(self.grouper) == len(self.index)):
- errmsg = ('Grouper result violates len(labels) == '
- 'len(data)\nresult: %s' %
- pprint_thing(self.grouper))
+ if not (
+ hasattr(self.grouper, "__len__")
+ and len(self.grouper) == len(self.index)
+ ):
+ errmsg = (
+ "Grouper result violates len(labels) == "
+ "len(data)\nresult: %s" % pprint_thing(self.grouper)
+ )
self.grouper = None # Try for sanity
raise AssertionError(errmsg)
# if we have a date/time-like grouper, make sure that we have
# Timestamps like
- if getattr(self.grouper, 'dtype', None) is not None:
+ if getattr(self.grouper, "dtype", None) is not None:
if is_datetime64_dtype(self.grouper):
from pandas import to_datetime
+
self.grouper = to_datetime(self.grouper)
elif is_timedelta64_dtype(self.grouper):
from pandas import to_timedelta
+
self.grouper = to_timedelta(self.grouper)
def __repr__(self):
- return 'Grouping({0})'.format(self.name)
+ return "Grouping({0})".format(self.name)
def __iter__(self):
return iter(self.indices)
@@ -376,8 +402,8 @@ def labels(self):
def result_index(self):
if self.all_grouper is not None:
from pandas.core.groupby.categorical import recode_from_groupby
- return recode_from_groupby(self.all_grouper,
- self.sort, self.group_index)
+
+ return recode_from_groupby(self.all_grouper, self.sort, self.group_index)
return self.group_index
@property
@@ -393,20 +419,26 @@ def _make_labels(self):
labels = self.grouper.label_info
uniques = self.grouper.result_index
else:
- labels, uniques = algorithms.factorize(
- self.grouper, sort=self.sort)
+ labels, uniques = algorithms.factorize(self.grouper, sort=self.sort)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._group_index = uniques
@cache_readonly
def groups(self):
- return self.index.groupby(Categorical.from_codes(self.labels,
- self.group_index))
-
-
-def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
- observed=False, mutated=False, validate=True):
+ return self.index.groupby(Categorical.from_codes(self.labels, self.group_index))
+
+
+def _get_grouper(
+ obj,
+ key=None,
+ axis=0,
+ level=None,
+ sort=True,
+ observed=False,
+ mutated=False,
+ validate=True,
+):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
@@ -460,18 +492,17 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
if nlevels == 1:
level = level[0]
elif nlevels == 0:
- raise ValueError('No group keys passed!')
+ raise ValueError("No group keys passed!")
else:
- raise ValueError('multiple levels only valid with '
- 'MultiIndex')
+ raise ValueError("multiple levels only valid with " "MultiIndex")
if isinstance(level, str):
if obj.index.name != level:
- raise ValueError('level name {} is not the name of the '
- 'index'.format(level))
+ raise ValueError(
+ "level name {} is not the name of the " "index".format(level)
+ )
elif level > 0 or level < -1:
- raise ValueError(
- 'level > 0 or level < -1 only valid with MultiIndex')
+ raise ValueError("level > 0 or level < -1 only valid with MultiIndex")
# NOTE: `group_axis` and `group_axis.get_level_values(level)`
# are same in this section.
@@ -501,13 +532,16 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
all_hashable = is_tuple and is_hashable(key)
if is_tuple:
- if ((all_hashable and key not in obj and set(key).issubset(obj))
- or not all_hashable):
+ if (
+ all_hashable and key not in obj and set(key).issubset(obj)
+ ) or not all_hashable:
# column names ('a', 'b') -> ['a', 'b']
# arrays like (a, b) -> [a, b]
- msg = ("Interpreting tuple 'by' as a list of keys, rather than "
- "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
- "the future, a tuple will always mean a single key.")
+ msg = (
+ "Interpreting tuple 'by' as a list of keys, rather than "
+ "a single key. Use 'by=[...]' instead of 'by=(...)'. In "
+ "the future, a tuple will always mean a single key."
+ )
warnings.warn(msg, FutureWarning, stacklevel=5)
key = list(key)
@@ -521,15 +555,22 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
# what are we after, exactly?
any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
any_groupers = any(isinstance(g, Grouper) for g in keys)
- any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray))
- for g in keys)
+ any_arraylike = any(
+ isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
+ )
# is this an index replacement?
- if (not any_callable and not any_arraylike and not any_groupers and
- match_axis_length and level is None):
+ if (
+ not any_callable
+ and not any_arraylike
+ and not any_groupers
+ and match_axis_length
+ and level is None
+ ):
if isinstance(obj, DataFrame):
- all_in_columns_index = all(g in obj.columns or g in
- obj.index.names for g in keys)
+ all_in_columns_index = all(
+ g in obj.columns or g in obj.index.names for g in keys
+ )
elif isinstance(obj, Series):
all_in_columns_index = all(g in obj.index.names for g in keys)
@@ -588,29 +629,37 @@ def is_in_obj(gpr):
if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:
raise ValueError(
- ("Length of grouper ({len_gpr}) and axis ({len_axis})"
- " must be same length"
- .format(len_gpr=len(gpr), len_axis=obj.shape[axis])))
+ (
+ "Length of grouper ({len_gpr}) and axis ({len_axis})"
+ " must be same length".format(
+ len_gpr=len(gpr), len_axis=obj.shape[axis]
+ )
+ )
+ )
# create the Grouping
# allow us to passing the actual Grouping as the gpr
- ping = (Grouping(group_axis,
- gpr,
- obj=obj,
- name=name,
- level=level,
- sort=sort,
- observed=observed,
- in_axis=in_axis)
- if not isinstance(gpr, Grouping) else gpr)
+ ping = (
+ Grouping(
+ group_axis,
+ gpr,
+ obj=obj,
+ name=name,
+ level=level,
+ sort=sort,
+ observed=observed,
+ in_axis=in_axis,
+ )
+ if not isinstance(gpr, Grouping)
+ else gpr
+ )
groupings.append(ping)
if len(groupings) == 0 and len(obj):
- raise ValueError('No group keys passed!')
+ raise ValueError("No group keys passed!")
elif len(groupings) == 0:
- groupings.append(Grouping(Index([], dtype='int'),
- np.array([], dtype=np.intp)))
+ groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))
# create the internals grouper
grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)
@@ -618,8 +667,7 @@ def is_in_obj(gpr):
def _is_label_like(val):
- return (isinstance(val, (str, tuple)) or
- (val is not None and is_scalar(val)))
+ return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
def _convert_grouper(axis, grouper):
@@ -632,7 +680,7 @@ def _convert_grouper(axis, grouper):
return grouper.reindex(axis)._values
elif isinstance(grouper, (list, Series, Index, np.ndarray)):
if len(grouper) != len(axis):
- raise ValueError('Grouper and axis must be same length')
+ raise ValueError("Grouper and axis must be same length")
return grouper
else:
return grouper
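Besides import and error-message reformatting, the grouper.py hunks touch Grouper.__new__ (which swaps in TimeGrouper when a freq is supplied) and the __repr__ built from the _attributes tuple. A short usage sketch of that behaviour, assuming a DataFrame with a datetime "date" column (the data below is invented for illustration):

    import pandas as pd

    df = pd.DataFrame(
        {"date": pd.date_range("2019-01-01", periods=6, freq="30s"), "value": range(6)}
    )

    # freq is not None, so Grouper.__new__ hands construction to TimeGrouper
    g = pd.Grouper(key="date", freq="60s")
    print(repr(g))  # repr lists only the non-None entries of _attributes

    # resample-like grouping into 60-second bins
    print(df.groupby(g).sum())
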
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index dd44bc6990d59..33341a489866b 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -17,10 +17,21 @@
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import (
- ensure_float64, ensure_int64, ensure_int_or_float, ensure_object,
- ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_complex_dtype,
- is_datetime64_any_dtype, is_integer_dtype, is_numeric_dtype, is_sparse,
- is_timedelta64_dtype, needs_i8_conversion)
+ ensure_float64,
+ ensure_int64,
+ ensure_int_or_float,
+ ensure_object,
+ ensure_platform_int,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_complex_dtype,
+ is_datetime64_any_dtype,
+ is_integer_dtype,
+ is_numeric_dtype,
+ is_sparse,
+ is_timedelta64_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.missing import _maybe_fill, isna
import pandas.core.algorithms as algorithms
@@ -32,8 +43,13 @@
from pandas.core.index import Index, MultiIndex, ensure_index
from pandas.core.series import Series
from pandas.core.sorting import (
- compress_group_index, decons_obs_group_ids, get_flattened_iterator,
- get_group_index, get_group_index_sorter, get_indexer_dict)
+ compress_group_index,
+ decons_obs_group_ids,
+ get_flattened_iterator,
+ get_group_index,
+ get_group_index_sorter,
+ get_indexer_dict,
+)
def generate_bins_generic(values, binner, closed):
@@ -78,8 +94,9 @@ def generate_bins_generic(values, binner, closed):
r_bin = binner[i + 1]
# count values in current bin, advance to next bin
- while j < lenidx and (values[j] < r_bin or
- (closed == 'right' and values[j] == r_bin)):
+ while j < lenidx and (
+ values[j] < r_bin or (closed == "right" and values[j] == r_bin)
+ ):
j += 1
bins[bc] = j
@@ -111,8 +128,9 @@ class BaseGrouper:
"""
- def __init__(self, axis, groupings, sort=True, group_keys=True,
- mutated=False, indexer=None):
+ def __init__(
+ self, axis, groupings, sort=True, group_keys=True, mutated=False, indexer=None
+ ):
self._filter_empty_groups = self.compressed = len(groupings) != 1
self.axis = axis
self.groupings = groupings
@@ -166,10 +184,7 @@ def _get_group_keys(self):
comp_ids, _, ngroups = self.group_info
# provide "flattened" iterator for multi-group setting
- return get_flattened_iterator(comp_ids,
- ngroups,
- self.levels,
- self.labels)
+ return get_flattened_iterator(comp_ids, ngroups, self.levels, self.labels)
def apply(self, f, data, axis=0):
mutated = self.mutated
@@ -179,8 +194,11 @@ def apply(self, f, data, axis=0):
# oh boy
f_name = com.get_callable_name(f)
- if (f_name not in base.plotting_methods and
- hasattr(splitter, 'fast_apply') and axis == 0):
+ if (
+ f_name not in base.plotting_methods
+ and hasattr(splitter, "fast_apply")
+ and axis == 0
+ ):
try:
result_values, mutated = splitter.fast_apply(f, group_keys)
@@ -199,7 +217,7 @@ def apply(self, f, data, axis=0):
pass
for key, (i, group) in zip(group_keys, splitter):
- object.__setattr__(group, 'name', key)
+ object.__setattr__(group, "name", key)
# result_values is None if fast apply path wasn't taken
# or fast apply aborted with an unexpected exception.
@@ -230,8 +248,7 @@ def indices(self):
return self.groupings[0].indices
else:
label_list = [ping.labels for ping in self.groupings]
- keys = [com.values_from_object(ping.group_index)
- for ping in self.groupings]
+ keys = [com.values_from_object(ping.group_index) for ping in self.groupings]
return get_indexer_dict(label_list, keys)
@property
@@ -257,9 +274,7 @@ def size(self):
out = np.bincount(ids[ids != -1], minlength=ngroup)
else:
out = []
- return Series(out,
- index=self.result_index,
- dtype='int64')
+ return Series(out, index=self.result_index, dtype="int64")
@cache_readonly
def groups(self):
@@ -296,8 +311,7 @@ def label_info(self):
def _get_compressed_labels(self):
all_labels = [ping.labels for ping in self.groupings]
if len(all_labels) > 1:
- group_index = get_group_index(all_labels, self.shape,
- sort=True, xnull=True)
+ group_index = get_group_index(all_labels, self.shape, sort=True, xnull=True)
return compress_group_index(group_index, sort=self.sort)
ping = self.groupings[0]
@@ -311,8 +325,7 @@ def ngroups(self):
def recons_labels(self):
comp_ids, obs_ids, _ = self.group_info
labels = (ping.labels for ping in self.groupings)
- return decons_obs_group_ids(
- comp_ids, obs_ids, self.shape, labels, xnull=True)
+ return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels, xnull=True)
@cache_readonly
def result_index(self):
@@ -321,10 +334,9 @@ def result_index(self):
codes = self.recons_labels
levels = [ping.result_index for ping in self.groupings]
- result = MultiIndex(levels=levels,
- codes=codes,
- verify_integrity=False,
- names=self.names)
+ result = MultiIndex(
+ levels=levels, codes=codes, verify_integrity=False, names=self.names
+ )
return result
def get_group_levels(self):
@@ -344,49 +356,45 @@ def get_group_levels(self):
# Aggregation functions
_cython_functions = {
- 'aggregate': {
- 'add': 'group_add',
- 'prod': 'group_prod',
- 'min': 'group_min',
- 'max': 'group_max',
- 'mean': 'group_mean',
- 'median': {
- 'name': 'group_median'
+ "aggregate": {
+ "add": "group_add",
+ "prod": "group_prod",
+ "min": "group_min",
+ "max": "group_max",
+ "mean": "group_mean",
+ "median": {"name": "group_median"},
+ "var": "group_var",
+ "first": {
+ "name": "group_nth",
+ "f": lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1),
},
- 'var': 'group_var',
- 'first': {
- 'name': 'group_nth',
- 'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1)
+ "last": "group_last",
+ "ohlc": "group_ohlc",
+ },
+ "transform": {
+ "cumprod": "group_cumprod",
+ "cumsum": "group_cumsum",
+ "cummin": "group_cummin",
+ "cummax": "group_cummax",
+ "rank": {
+ "name": "group_rank",
+ "f": lambda func, a, b, c, d, e, **kwargs: func(
+ a,
+ b,
+ c,
+ e,
+ kwargs.get("ties_method", "average"),
+ kwargs.get("ascending", True),
+ kwargs.get("pct", False),
+ kwargs.get("na_option", "keep"),
+ ),
},
- 'last': 'group_last',
- 'ohlc': 'group_ohlc',
},
-
- 'transform': {
- 'cumprod': 'group_cumprod',
- 'cumsum': 'group_cumsum',
- 'cummin': 'group_cummin',
- 'cummax': 'group_cummax',
- 'rank': {
- 'name': 'group_rank',
- 'f': lambda func, a, b, c, d, e, **kwargs: func(
- a, b, c, e,
- kwargs.get('ties_method', 'average'),
- kwargs.get('ascending', True),
- kwargs.get('pct', False),
- kwargs.get('na_option', 'keep')
- )
- }
- }
}
- _cython_arity = {
- 'ohlc': 4, # OHLC
- }
+ _cython_arity = {"ohlc": 4} # OHLC
- _name_functions = {
- 'ohlc': lambda *args: ['open', 'high', 'low', 'close']
- }
+ _name_functions = {"ohlc": lambda *args: ["open", "high", "low", "close"]}
def _is_builtin_func(self, arg):
"""
@@ -407,19 +415,22 @@ def get_func(fname):
return f
# otherwise find dtype-specific version, falling back to object
- for dt in [dtype_str, 'object']:
- f = getattr(libgroupby, "{fname}_{dtype_str}".format(
- fname=fname, dtype_str=dt), None)
+ for dt in [dtype_str, "object"]:
+ f = getattr(
+ libgroupby,
+ "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt),
+ None,
+ )
if f is not None:
return f
ftype = self._cython_functions[kind][how]
if isinstance(ftype, dict):
- func = afunc = get_func(ftype['name'])
+ func = afunc = get_func(ftype["name"])
# a sub-function
- f = ftype.get('f')
+ f = ftype.get("f")
if f is not None:
def wrapper(*args, **kwargs):
@@ -434,14 +445,13 @@ def wrapper(*args, **kwargs):
if func is None:
raise NotImplementedError(
"function is not implemented for this dtype: "
- "[how->{how},dtype->{dtype_str}]".format(how=how,
- dtype_str=dtype_str))
+ "[how->{how},dtype->{dtype_str}]".format(how=how, dtype_str=dtype_str)
+ )
return func
- def _cython_operation(self, kind, values, how, axis, min_count=-1,
- **kwargs):
- assert kind in ['transform', 'aggregate']
+ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
+ assert kind in ["transform", "aggregate"]
# can we do this operation with our cython functions
# if not raise NotImplementedError
@@ -453,17 +463,18 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
# are not setup for dim transforming
if is_categorical_dtype(values) or is_sparse(values):
raise NotImplementedError(
- "{} are not support in cython ops".format(values.dtype))
+ "{} are not support in cython ops".format(values.dtype)
+ )
elif is_datetime64_any_dtype(values):
- if how in ['add', 'prod', 'cumsum', 'cumprod']:
+ if how in ["add", "prod", "cumsum", "cumprod"]:
raise NotImplementedError(
- "datetime64 type does not support {} "
- "operations".format(how))
+ "datetime64 type does not support {} " "operations".format(how)
+ )
elif is_timedelta64_dtype(values):
- if how in ['prod', 'cumprod']:
+ if how in ["prod", "cumprod"]:
raise NotImplementedError(
- "timedelta64 type does not support {} "
- "operations".format(how))
+ "timedelta64 type does not support {} " "operations".format(how)
+ )
arity = self._cython_arity.get(how, 1)
@@ -478,15 +489,16 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
assert axis == 1, axis
values = values.T
if arity > 1:
- raise NotImplementedError("arity of more than 1 is not "
- "supported for the 'how' argument")
+ raise NotImplementedError(
+ "arity of more than 1 is not " "supported for the 'how' argument"
+ )
out_shape = (self.ngroups,) + values.shape[1:]
is_datetimelike = needs_i8_conversion(values.dtype)
is_numeric = is_numeric_dtype(values.dtype)
if is_datetimelike:
- values = values.view('int64')
+ values = values.view("int64")
is_numeric = True
elif is_bool_dtype(values.dtype):
values = ensure_float64(values)
@@ -503,59 +515,65 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
values = values.astype(object)
try:
- func = self._get_cython_function(
- kind, how, values, is_numeric)
+ func = self._get_cython_function(kind, how, values, is_numeric)
except NotImplementedError:
if is_numeric:
values = ensure_float64(values)
- func = self._get_cython_function(
- kind, how, values, is_numeric)
+ func = self._get_cython_function(kind, how, values, is_numeric)
else:
raise
- if how == 'rank':
- out_dtype = 'float'
+ if how == "rank":
+ out_dtype = "float"
else:
if is_numeric:
- out_dtype = '{kind}{itemsize}'.format(
- kind=values.dtype.kind, itemsize=values.dtype.itemsize)
+ out_dtype = "{kind}{itemsize}".format(
+ kind=values.dtype.kind, itemsize=values.dtype.itemsize
+ )
else:
- out_dtype = 'object'
+ out_dtype = "object"
labels, _, _ = self.group_info
- if kind == 'aggregate':
- result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
- fill_value=np.nan)
+ if kind == "aggregate":
+ result = _maybe_fill(
+ np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
+ )
counts = np.zeros(self.ngroups, dtype=np.int64)
result = self._aggregate(
- result, counts, values, labels, func, is_numeric,
- is_datetimelike, min_count)
- elif kind == 'transform':
- result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
- fill_value=np.nan)
+ result,
+ counts,
+ values,
+ labels,
+ func,
+ is_numeric,
+ is_datetimelike,
+ min_count,
+ )
+ elif kind == "transform":
+ result = _maybe_fill(
+ np.empty_like(values, dtype=out_dtype), fill_value=np.nan
+ )
# TODO: min_count
result = self._transform(
- result, values, labels, func, is_numeric, is_datetimelike,
- **kwargs)
+ result, values, labels, func, is_numeric, is_datetimelike, **kwargs
+ )
if is_integer_dtype(result) and not is_datetimelike:
mask = result == iNaT
if mask.any():
- result = result.astype('float64')
+ result = result.astype("float64")
result[mask] = np.nan
- if (kind == 'aggregate' and
- self._filter_empty_groups and not counts.all()):
+ if kind == "aggregate" and self._filter_empty_groups and not counts.all():
if result.ndim == 2:
try:
- result = lib.row_bool_subset(
- result, (counts > 0).view(np.uint8))
+ result = lib.row_bool_subset(result, (counts > 0).view(np.uint8))
except ValueError:
result = lib.row_bool_subset_object(
- ensure_object(result),
- (counts > 0).view(np.uint8))
+ ensure_object(result), (counts > 0).view(np.uint8)
+ )
else:
result = result[counts > 0]
@@ -574,45 +592,69 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
return result, names
def aggregate(self, values, how, axis=0, min_count=-1):
- return self._cython_operation('aggregate', values, how, axis,
- min_count=min_count)
+ return self._cython_operation(
+ "aggregate", values, how, axis, min_count=min_count
+ )
def transform(self, values, how, axis=0, **kwargs):
- return self._cython_operation('transform', values, how, axis, **kwargs)
-
- def _aggregate(self, result, counts, values, comp_ids, agg_func,
- is_numeric, is_datetimelike, min_count=-1):
+ return self._cython_operation("transform", values, how, axis, **kwargs)
+
+ def _aggregate(
+ self,
+ result,
+ counts,
+ values,
+ comp_ids,
+ agg_func,
+ is_numeric,
+ is_datetimelike,
+ min_count=-1,
+ ):
if values.ndim > 3:
# punting for now
- raise NotImplementedError("number of dimensions is currently "
- "limited to 3")
+ raise NotImplementedError(
+ "number of dimensions is currently " "limited to 3"
+ )
elif values.ndim > 2:
for i, chunk in enumerate(values.transpose(2, 0, 1)):
chunk = chunk.squeeze()
- agg_func(result[:, :, i], counts, chunk, comp_ids,
- min_count)
+ agg_func(result[:, :, i], counts, chunk, comp_ids, min_count)
else:
agg_func(result, counts, values, comp_ids, min_count)
return result
- def _transform(self, result, values, comp_ids, transform_func,
- is_numeric, is_datetimelike, **kwargs):
+ def _transform(
+ self,
+ result,
+ values,
+ comp_ids,
+ transform_func,
+ is_numeric,
+ is_datetimelike,
+ **kwargs
+ ):
comp_ids, _, ngroups = self.group_info
if values.ndim > 3:
# punting for now
- raise NotImplementedError("number of dimensions is currently "
- "limited to 3")
+ raise NotImplementedError(
+ "number of dimensions is currently " "limited to 3"
+ )
elif values.ndim > 2:
for i, chunk in enumerate(values.transpose(2, 0, 1)):
- transform_func(result[:, :, i], values,
- comp_ids, ngroups, is_datetimelike, **kwargs)
+ transform_func(
+ result[:, :, i],
+ values,
+ comp_ids,
+ ngroups,
+ is_datetimelike,
+ **kwargs
+ )
else:
- transform_func(result, values, comp_ids, ngroups, is_datetimelike,
- **kwargs)
+ transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
return result
@@ -626,7 +668,7 @@ def _aggregate_series_fast(self, obj, func):
func = self._is_builtin_func(func)
if obj.index._has_complex_internals:
- raise TypeError('Incompatible index for Cython grouper')
+ raise TypeError("Incompatible index for Cython grouper")
group_index, _, ngroups = self.group_info
@@ -634,10 +676,8 @@ def _aggregate_series_fast(self, obj, func):
dummy = obj._get_values(slice(None, 0))
indexer = get_group_index_sorter(group_index, ngroups)
obj = obj._take(indexer)
- group_index = algorithms.take_nd(
- group_index, indexer, allow_fill=False)
- grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups,
- dummy)
+ group_index = algorithms.take_nd(group_index, indexer, allow_fill=False)
+ grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy)
result, counts = grouper.get_result()
return result, counts
@@ -653,9 +693,9 @@ def _aggregate_series_pure_python(self, obj, func):
for label, group in splitter:
res = func(group)
if result is None:
- if (isinstance(res, (Series, Index, np.ndarray))):
- raise ValueError('Function does not reduce')
- result = np.empty(ngroups, dtype='O')
+ if isinstance(res, (Series, Index, np.ndarray)):
+ raise ValueError("Function does not reduce")
+ result = np.empty(ngroups, dtype="O")
counts[label] = group.shape[0]
result[label] = res
@@ -695,8 +735,9 @@ class BinGrouper(BaseGrouper):
"""
- def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
- indexer=None):
+ def __init__(
+ self, bins, binlabels, filter_empty=False, mutated=False, indexer=None
+ ):
self.bins = ensure_int64(bins)
self.binlabels = ensure_index(binlabels)
self._filter_empty_groups = filter_empty
@@ -709,8 +750,11 @@ def groups(self):
# this is mainly for compat
# GH 3881
- result = {key: value for key, value in zip(self.binlabels, self.bins)
- if key is not NaT}
+ result = {
+ key: value
+ for key, value in zip(self.binlabels, self.bins)
+ if key is not NaT
+ }
return result
@property
@@ -736,8 +780,7 @@ def get_iterator(self, data, axis=0):
for each group
"""
if isinstance(data, NDFrame):
- slicer = lambda start, edge: data._slice(
- slice(start, edge), axis=axis)
+ slicer = lambda start, edge: data._slice(slice(start, edge), axis=axis)
length = len(data.axes[axis])
else:
slicer = lambda start, edge: data[slice(start, edge)]
@@ -776,9 +819,11 @@ def group_info(self):
else:
comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)
- return (comp_ids.astype('int64', copy=False),
- obs_group_ids.astype('int64', copy=False),
- ngroups)
+ return (
+ comp_ids.astype("int64", copy=False),
+ obs_group_ids.astype("int64", copy=False),
+ ngroups,
+ )
@cache_readonly
def result_index(self):
@@ -798,8 +843,11 @@ def names(self):
@property
def groupings(self):
from pandas.core.groupby.grouper import Grouping
- return [Grouping(lvl, lvl, in_axis=False, level=None, name=name)
- for lvl, name in zip(self.levels, self.names)]
+
+ return [
+ Grouping(lvl, lvl, in_axis=False, level=None, name=name)
+ for lvl, name in zip(self.levels, self.names)
+ ]
def agg_series(self, obj, func):
dummy = obj[:0]
@@ -830,7 +878,6 @@ def _is_indexed_like(obj, axes):
class DataSplitter:
-
def __init__(self, data, labels, ngroups, axis=0):
self.data = data
self.labels = ensure_int64(labels)
@@ -878,13 +925,11 @@ def apply(self, f):
class SeriesSplitter(DataSplitter):
-
def _chop(self, sdata, slice_obj):
return sdata._get_values(slice_obj)
class FrameSplitter(DataSplitter):
-
def fast_apply(self, f, names):
# must return keys::list, values::list, mutated::bool
try:
diff --git a/pandas/core/index.py b/pandas/core/index.py
index f14f32c67d4e1..d308ac1a9b1c7 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -1,7 +1,25 @@
from pandas.core.indexes.api import ( # noqa:F401
- CategoricalIndex, DatetimeIndex, Float64Index, Index, Int64Index,
- IntervalIndex, InvalidIndexError, MultiIndex, NaT, NumericIndex,
- PeriodIndex, RangeIndex, TimedeltaIndex, UInt64Index, _all_indexes_same,
- _get_combined_index, _get_consensus_names, _get_objs_combined_axis,
- _new_Index, _union_indexes, ensure_index, ensure_index_from_sequences)
+ CategoricalIndex,
+ DatetimeIndex,
+ Float64Index,
+ Index,
+ Int64Index,
+ IntervalIndex,
+ InvalidIndexError,
+ MultiIndex,
+ NaT,
+ NumericIndex,
+ PeriodIndex,
+ RangeIndex,
+ TimedeltaIndex,
+ UInt64Index,
+ _all_indexes_same,
+ _get_combined_index,
+ _get_consensus_names,
+ _get_objs_combined_axis,
+ _new_Index,
+ _union_indexes,
+ ensure_index,
+ ensure_index_from_sequences,
+)
from pandas.core.indexes.multi import _sparsify # noqa:F401
diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
index 602e11a08b4ed..5ba23990cbd51 100644
--- a/pandas/core/indexes/accessors.py
+++ b/pandas/core/indexes/accessors.py
@@ -4,9 +4,15 @@
import numpy as np
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
- is_datetime_arraylike, is_integer_dtype, is_list_like, is_period_arraylike,
- is_timedelta64_dtype)
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetime_arraylike,
+ is_integer_dtype,
+ is_list_like,
+ is_period_arraylike,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.accessor import PandasDelegate, delegate_names
@@ -18,15 +24,16 @@
class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin):
-
def __init__(self, data, orig):
if not isinstance(data, ABCSeries):
- raise TypeError("cannot convert an object of type {0} to a "
- "datetimelike index".format(type(data)))
+ raise TypeError(
+ "cannot convert an object of type {0} to a "
+ "datetimelike index".format(type(data))
+ )
self._parent = data
self.orig = orig
- self.name = getattr(data, 'name', None)
+ self.name = getattr(data, "name", None)
self._freeze()
def _get_values(self):
@@ -47,11 +54,14 @@ def _get_values(self):
if is_datetime_arraylike(data):
return DatetimeIndex(data, copy=False, name=self.name)
- raise TypeError("cannot convert an object of type {0} to a "
- "datetimelike index".format(type(data)))
+ raise TypeError(
+ "cannot convert an object of type {0} to a "
+ "datetimelike index".format(type(data))
+ )
def _delegate_property_get(self, name):
from pandas import Series
+
values = self._get_values()
result = getattr(values, name)
@@ -59,7 +69,7 @@ def _delegate_property_get(self, name):
# maybe need to upcast (ints)
if isinstance(result, np.ndarray):
if is_integer_dtype(result):
- result = result.astype('int64')
+ result = result.astype("int64")
elif not is_list_like(result):
return result
@@ -75,19 +85,24 @@ def _delegate_property_get(self, name):
result = Series(result, index=index, name=self.name)
# setting this object will show a SettingWithCopyWarning/Error
- result._is_copy = ("modifications to a property of a datetimelike "
- "object are not supported and are discarded. "
- "Change values on the original.")
+ result._is_copy = (
+ "modifications to a property of a datetimelike "
+ "object are not supported and are discarded. "
+ "Change values on the original."
+ )
return result
def _delegate_property_set(self, name, value, *args, **kwargs):
- raise ValueError("modifications to a property of a datetimelike "
- "object are not supported. Change values on the "
- "original.")
+ raise ValueError(
+ "modifications to a property of a datetimelike "
+ "object are not supported. Change values on the "
+ "original."
+ )
def _delegate_method(self, name, *args, **kwargs):
from pandas import Series
+
values = self._get_values()
method = getattr(values, name)
@@ -99,19 +114,21 @@ def _delegate_method(self, name, *args, **kwargs):
result = Series(result, index=self._parent.index, name=self.name)
# setting this object will show a SettingWithCopyWarning/Error
- result._is_copy = ("modifications to a method of a datetimelike "
- "object are not supported and are discarded. "
- "Change values on the original.")
+ result._is_copy = (
+ "modifications to a method of a datetimelike "
+ "object are not supported and are discarded. "
+ "Change values on the original."
+ )
return result
-@delegate_names(delegate=DatetimeArray,
- accessors=DatetimeArray._datetimelike_ops,
- typ="property")
-@delegate_names(delegate=DatetimeArray,
- accessors=DatetimeArray._datetimelike_methods,
- typ="method")
+@delegate_names(
+ delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_ops, typ="property"
+)
+@delegate_names(
+ delegate=DatetimeArray, accessors=DatetimeArray._datetimelike_methods, typ="method"
+)
class DatetimeProperties(Properties):
"""
Accessor object for datetimelike properties of the Series values.
@@ -177,12 +194,14 @@ def freq(self):
return self._get_values().inferred_freq
-@delegate_names(delegate=TimedeltaArray,
- accessors=TimedeltaArray._datetimelike_ops,
- typ="property")
-@delegate_names(delegate=TimedeltaArray,
- accessors=TimedeltaArray._datetimelike_methods,
- typ="method")
+@delegate_names(
+ delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property"
+)
+@delegate_names(
+ delegate=TimedeltaArray,
+ accessors=TimedeltaArray._datetimelike_methods,
+ typ="method",
+)
class TimedeltaProperties(Properties):
"""
Accessor object for datetimelike properties of the Series values.
@@ -266,12 +285,12 @@ def freq(self):
return self._get_values().inferred_freq
-@delegate_names(delegate=PeriodArray,
- accessors=PeriodArray._datetimelike_ops,
- typ="property")
-@delegate_names(delegate=PeriodArray,
- accessors=PeriodArray._datetimelike_methods,
- typ="method")
+@delegate_names(
+ delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property"
+)
+@delegate_names(
+ delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method"
+)
class PeriodProperties(Properties):
"""
Accessor object for datetimelike properties of the Series values.
@@ -287,9 +306,9 @@ class PeriodProperties(Properties):
"""
-class CombinedDatetimelikeProperties(DatetimeProperties,
- TimedeltaProperties, PeriodProperties):
-
+class CombinedDatetimelikeProperties(
+ DatetimeProperties, TimedeltaProperties, PeriodProperties
+):
def __new__(cls, data):
# CombinedDatetimelikeProperties isn't really instantiated. Instead
# we need to choose which parent (datetime or timedelta) is
@@ -298,14 +317,14 @@ def __new__(cls, data):
from pandas import Series
if not isinstance(data, Series):
- raise TypeError("cannot convert an object of type {0} to a "
- "datetimelike index".format(type(data)))
+ raise TypeError(
+ "cannot convert an object of type {0} to a "
+ "datetimelike index".format(type(data))
+ )
orig = data if is_categorical_dtype(data) else None
if orig is not None:
- data = Series(orig.values.categories,
- name=orig.name,
- copy=False)
+ data = Series(orig.values.categories, name=orig.name, copy=False)
try:
if is_datetime64_dtype(data.dtype):
@@ -321,5 +340,4 @@ def __new__(cls, data):
except Exception:
pass # we raise an attribute error anyway
- raise AttributeError("Can only use .dt accessor with datetimelike "
- "values")
+ raise AttributeError("Can only use .dt accessor with datetimelike " "values")
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index 6299fc482d0df..a17f74286d59f 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -5,40 +5,64 @@
import pandas.core.common as com
from pandas.core.indexes.base import (
- Index, _new_Index, ensure_index, ensure_index_from_sequences)
+ Index,
+ _new_Index,
+ ensure_index,
+ ensure_index_from_sequences,
+)
from pandas.core.indexes.base import InvalidIndexError # noqa:F401
from pandas.core.indexes.category import CategoricalIndex # noqa:F401
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.interval import IntervalIndex # noqa:F401
from pandas.core.indexes.multi import MultiIndex # noqa:F401
from pandas.core.indexes.numeric import ( # noqa:F401
- Float64Index, Int64Index, NumericIndex, UInt64Index)
+ Float64Index,
+ Int64Index,
+ NumericIndex,
+ UInt64Index,
+)
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexes.range import RangeIndex # noqa:F401
from pandas.core.indexes.timedeltas import TimedeltaIndex
-_sort_msg = textwrap.dedent("""\
+_sort_msg = textwrap.dedent(
+ """\
Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
-""")
+"""
+)
# TODO: there are many places that rely on these private methods existing in
# pandas.core.index
-__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
- 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
- 'InvalidIndexError', 'TimedeltaIndex',
- 'PeriodIndex', 'DatetimeIndex',
- '_new_Index', 'NaT',
- 'ensure_index', 'ensure_index_from_sequences',
- '_get_combined_index',
- '_get_objs_combined_axis', '_union_indexes',
- '_get_consensus_names',
- '_all_indexes_same']
+__all__ = [
+ "Index",
+ "MultiIndex",
+ "NumericIndex",
+ "Float64Index",
+ "Int64Index",
+ "CategoricalIndex",
+ "IntervalIndex",
+ "RangeIndex",
+ "UInt64Index",
+ "InvalidIndexError",
+ "TimedeltaIndex",
+ "PeriodIndex",
+ "DatetimeIndex",
+ "_new_Index",
+ "NaT",
+ "ensure_index",
+ "ensure_index_from_sequences",
+ "_get_combined_index",
+ "_get_objs_combined_axis",
+ "_union_indexes",
+ "_get_consensus_names",
+ "_all_indexes_same",
+]
def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
@@ -64,8 +88,7 @@ def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
-------
Index
"""
- obs_idxes = [obj._get_axis(axis) for obj in objs
- if hasattr(obj, '_get_axis')]
+ obs_idxes = [obj._get_axis(axis) for obj in objs if hasattr(obj, "_get_axis")]
if obs_idxes:
return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
@@ -142,7 +165,7 @@ def _union_indexes(indexes, sort=True):
Index
"""
if len(indexes) == 0:
- raise AssertionError('Must have at least 1 Index to union')
+ raise AssertionError("Must have at least 1 Index to union")
if len(indexes) == 1:
result = indexes[0]
if isinstance(result, list):
@@ -165,24 +188,24 @@ def _unique_indices(inds):
-------
Index
"""
+
def conv(i):
if isinstance(i, Index):
i = i.tolist()
return i
- return Index(
- lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
+ return Index(lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
- if kind == 'special':
+ if kind == "special":
result = indexes[0]
- if hasattr(result, 'union_many'):
+ if hasattr(result, "union_many"):
return result.union_many(indexes[1:])
else:
for other in indexes[1:]:
result = result.union(other)
return result
- elif kind == 'array':
+ elif kind == "array":
index = indexes[0]
for other in indexes[1:]:
if not index.equals(other):
@@ -227,17 +250,18 @@ def _sanitize_and_check(indexes):
if list in kinds:
if len(kinds) > 1:
- indexes = [Index(com.try_sort(x))
- if not isinstance(x, Index) else
- x for x in indexes]
+ indexes = [
+ Index(com.try_sort(x)) if not isinstance(x, Index) else x
+ for x in indexes
+ ]
kinds.remove(list)
else:
- return indexes, 'list'
+ return indexes, "list"
if len(kinds) > 1 or Index not in kinds:
- return indexes, 'special'
+ return indexes, "special"
else:
- return indexes, 'array'
+ return indexes, "array"
def _get_consensus_names(indexes):
@@ -259,8 +283,7 @@ def _get_consensus_names(indexes):
# find the non-none names, need to tupleify to make
# the set hashable, then reverse on return
- consensus_names = {tuple(i.names) for i in indexes
- if com._any_not_none(*i.names)}
+ consensus_names = {tuple(i.names) for i in indexes if com._any_not_none(*i.names)}
if len(consensus_names) == 1:
return list(list(consensus_names)[0])
return [None] * indexes[0].nlevels
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 6e0d26750df00..973a022cfc3f1 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -17,18 +17,47 @@
from pandas.core.dtypes.cast import maybe_cast_to_integer_array
from pandas.core.dtypes.common import (
- ensure_categorical, ensure_int64, ensure_object, ensure_platform_int,
- is_bool, is_bool_dtype, is_categorical, is_categorical_dtype,
- is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal,
- is_extension_array_dtype, is_float, is_float_dtype, is_hashable,
- is_integer, is_integer_dtype, is_interval_dtype, is_iterator, is_list_like,
- is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype,
- is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype)
+ ensure_categorical,
+ ensure_int64,
+ ensure_object,
+ ensure_platform_int,
+ is_bool,
+ is_bool_dtype,
+ is_categorical,
+ is_categorical_dtype,
+ is_datetime64_any_dtype,
+ is_datetime64tz_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float,
+ is_float_dtype,
+ is_hashable,
+ is_integer,
+ is_integer_dtype,
+ is_interval_dtype,
+ is_iterator,
+ is_list_like,
+ is_object_dtype,
+ is_period_dtype,
+ is_scalar,
+ is_signed_integer_dtype,
+ is_timedelta64_dtype,
+ is_unsigned_integer_dtype,
+ pandas_dtype,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass,
- ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, ABCSeries,
- ABCTimedeltaArray, ABCTimedeltaIndex)
+ ABCDataFrame,
+ ABCDateOffset,
+ ABCDatetimeArray,
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCPandasArray,
+ ABCPeriodIndex,
+ ABCSeries,
+ ABCTimedeltaArray,
+ ABCTimedeltaIndex,
+)
from pandas.core.dtypes.missing import array_equivalent, isna
from pandas.core import ops
@@ -44,16 +73,24 @@
from pandas.core.strings import StringMethods
from pandas.io.formats.printing import (
- default_pprint, format_object_attrs, format_object_summary, pprint_thing)
-
-__all__ = ['Index']
-
-_unsortable_types = frozenset(('mixed', 'mixed-integer'))
-
-_index_doc_kwargs = dict(klass='Index', inplace='',
- target_klass='Index',
- raises_section='',
- unique='Index', duplicated='np.ndarray')
+ default_pprint,
+ format_object_attrs,
+ format_object_summary,
+ pprint_thing,
+)
+
+__all__ = ["Index"]
+
+_unsortable_types = frozenset(("mixed", "mixed-integer"))
+
+_index_doc_kwargs = dict(
+ klass="Index",
+ inplace="",
+ target_klass="Index",
+ raises_section="",
+ unique="Index",
+ duplicated="np.ndarray",
+)
_index_shared_docs = dict()
@@ -61,15 +98,15 @@ def _make_comparison_op(op, cls):
def cmp_method(self, other):
if isinstance(other, (np.ndarray, Index, ABCSeries)):
if other.ndim > 0 and len(self) != len(other):
- raise ValueError('Lengths must match to compare')
+ raise ValueError("Lengths must match to compare")
if is_object_dtype(self) and not isinstance(self, ABCMultiIndex):
# don't pass MultiIndex
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = ops._comp_method_OBJECT_ARRAY(op, self.values, other)
else:
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(self.values, np.asarray(other))
# technically we could support bool dtyped Index
@@ -81,7 +118,7 @@ def cmp_method(self, other):
except TypeError:
return result
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
# TODO: docstring?
return set_function_name(cmp_method, name, cls)
@@ -93,12 +130,14 @@ def index_arithmetic_method(self, other):
elif isinstance(other, ABCTimedeltaIndex):
# Defer to subclass implementation
return NotImplemented
- elif (isinstance(other, (np.ndarray, ABCTimedeltaArray)) and
- is_timedelta64_dtype(other)):
+ elif isinstance(
+ other, (np.ndarray, ABCTimedeltaArray)
+ ) and is_timedelta64_dtype(other):
# GH#22390; wrap in Series for op, this will in turn wrap in
# TimedeltaIndex, but will correctly raise TypeError instead of
# NullFrequencyError for add/sub ops
from pandas import Series
+
other = Series(other)
out = op(self, other)
return Index(out, name=self.name)
@@ -112,7 +151,7 @@ def index_arithmetic_method(self, other):
return self._evaluate_with_datetime_like(other, op)
values = self.values
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(values, other)
result = missing.dispatch_missing(op, values, other, result)
@@ -125,7 +164,7 @@ def index_arithmetic_method(self, other):
result = Index(result, **attrs)
return result
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
# TODO: docstring?
return set_function_name(index_arithmetic_method, name, cls)
@@ -147,6 +186,7 @@ def _new_Index(cls, d):
# ordinals through __new__ GH #13277
if issubclass(cls, ABCPeriodIndex):
from pandas.core.indexes.period import _new_PeriodIndex
+
return _new_PeriodIndex(cls, **d)
return cls.__new__(cls, **d)
@@ -191,8 +231,9 @@ class Index(IndexOpsMixin, PandasObject):
>>> pd.Index(list('abc'))
Index(['a', 'b', 'c'], dtype='object')
"""
+
# tolist is not actually deprecated, just suppressed in the __dir__
- _deprecations = DirNamesMixin._deprecations | frozenset(['tolist'])
+ _deprecations = DirNamesMixin._deprecations | frozenset(["tolist"])
# To hand over control to subclasses
_join_precedence = 1
@@ -213,12 +254,12 @@ def _inner_indexer(self, left, right):
def _outer_indexer(self, left, right):
return libjoin.outer_join_indexer(left, right)
- _typ = 'index'
+ _typ = "index"
_data = None
_id = None
name = None
- _comparables = ['name']
- _attributes = ['name']
+ _comparables = ["name"]
+ _attributes = ["name"]
_is_numeric_dtype = False
_can_hold_na = True
@@ -231,27 +272,39 @@ def _outer_indexer(self, left, right):
_engine_type = libindex.ObjectEngine
- _accessors = {'str'}
+ _accessors = {"str"}
str = CachedAccessor("str", StringMethods)
# --------------------------------------------------------------------
# Constructors
- def __new__(cls, data=None, dtype=None, copy=False, name=None,
- fastpath=None, tupleize_cols=True, **kwargs):
+ def __new__(
+ cls,
+ data=None,
+ dtype=None,
+ copy=False,
+ name=None,
+ fastpath=None,
+ tupleize_cols=True,
+ **kwargs
+ ):
- if name is None and hasattr(data, 'name'):
+ if name is None and hasattr(data, "name"):
name = data.name
if fastpath is not None:
- warnings.warn("The 'fastpath' keyword is deprecated, and will be "
- "removed in a future version.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
if fastpath:
return cls._simple_new(data, name)
from .range import RangeIndex
+
if isinstance(data, ABCPandasArray):
# ensure users don't accidentally put a PandasArray in an index.
data = data.to_numpy()
@@ -265,20 +318,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# categorical
elif is_categorical_dtype(data) or is_categorical_dtype(dtype):
from .category import CategoricalIndex
- return CategoricalIndex(data, dtype=dtype, copy=copy, name=name,
- **kwargs)
+
+ return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs)
# interval
- elif ((is_interval_dtype(data) or is_interval_dtype(dtype)) and
- not is_object_dtype(dtype)):
+ elif (
+ is_interval_dtype(data) or is_interval_dtype(dtype)
+ ) and not is_object_dtype(dtype):
from .interval import IntervalIndex
- closed = kwargs.get('closed', None)
- return IntervalIndex(data, dtype=dtype, name=name, copy=copy,
- closed=closed)
- elif (is_datetime64_any_dtype(data) or
- (dtype is not None and is_datetime64_any_dtype(dtype)) or
- 'tz' in kwargs):
+ closed = kwargs.get("closed", None)
+ return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed)
+
+ elif (
+ is_datetime64_any_dtype(data)
+ or (dtype is not None and is_datetime64_any_dtype(dtype))
+ or "tz" in kwargs
+ ):
from pandas import DatetimeIndex
if dtype is not None and is_dtype_equal(_o_dtype, dtype):
@@ -291,25 +347,30 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
result = DatetimeIndex(data, copy=False, name=name, **kwargs)
return result.astype(object)
else:
- result = DatetimeIndex(data, copy=copy, name=name,
- dtype=dtype, **kwargs)
+ result = DatetimeIndex(
+ data, copy=copy, name=name, dtype=dtype, **kwargs
+ )
return result
- elif (is_timedelta64_dtype(data) or
- (dtype is not None and is_timedelta64_dtype(dtype))):
+ elif is_timedelta64_dtype(data) or (
+ dtype is not None and is_timedelta64_dtype(dtype)
+ ):
from pandas import TimedeltaIndex
+
if dtype is not None and is_dtype_equal(_o_dtype, dtype):
# Note we can pass copy=False because the .astype below
# will always make a copy
result = TimedeltaIndex(data, copy=False, name=name, **kwargs)
return result.astype(object)
else:
- result = TimedeltaIndex(data, copy=copy, name=name,
- dtype=dtype, **kwargs)
+ result = TimedeltaIndex(
+ data, copy=copy, name=name, dtype=dtype, **kwargs
+ )
return result
elif is_period_dtype(data) and not is_object_dtype(dtype):
from pandas import PeriodIndex
+
result = PeriodIndex(data, copy=copy, name=name, **kwargs)
return result
@@ -320,12 +381,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# coerce to the provided dtype
data = dtype.construct_array_type()._from_sequence(
- data, dtype=dtype, copy=False)
+ data, dtype=dtype, copy=False
+ )
# coerce to the object dtype
data = data.astype(object)
- return Index(data, dtype=object, copy=copy, name=name,
- **kwargs)
+ return Index(data, dtype=object, copy=copy, name=name, **kwargs)
# index-like
elif isinstance(data, (np.ndarray, Index, ABCSeries)):
@@ -339,13 +400,13 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# GH 11836
if is_integer_dtype(dtype):
inferred = lib.infer_dtype(data, skipna=False)
- if inferred == 'integer':
- data = maybe_cast_to_integer_array(data, dtype,
- copy=copy)
- elif inferred in ['floating', 'mixed-integer-float']:
+ if inferred == "integer":
+ data = maybe_cast_to_integer_array(data, dtype, copy=copy)
+ elif inferred in ["floating", "mixed-integer-float"]:
if isna(data).any():
- raise ValueError('cannot convert float '
- 'NaN to integer')
+ raise ValueError(
+ "cannot convert float " "NaN to integer"
+ )
if inferred == "mixed-integer-float":
data = maybe_cast_to_integer_array(data, dtype)
@@ -354,22 +415,23 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# then coerce to integer.
try:
return cls._try_convert_to_int_index(
- data, copy, name, dtype)
+ data, copy, name, dtype
+ )
except ValueError:
pass
# Return an actual float index.
from .numeric import Float64Index
- return Float64Index(data, copy=copy, dtype=dtype,
- name=name)
- elif inferred == 'string':
+ return Float64Index(data, copy=copy, dtype=dtype, name=name)
+
+ elif inferred == "string":
pass
else:
data = data.astype(dtype)
elif is_float_dtype(dtype):
inferred = lib.infer_dtype(data, skipna=False)
- if inferred == 'string':
+ if inferred == "string":
pass
else:
data = data.astype(dtype)
@@ -378,25 +440,29 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
except (TypeError, ValueError) as e:
msg = str(e)
- if ("cannot convert float" in msg or
- "Trying to coerce float values to integer" in msg):
+ if (
+ "cannot convert float" in msg
+ or "Trying to coerce float values to integer" in msg
+ ):
raise
# maybe coerce to a sub-class
- from pandas.core.indexes.period import (
- PeriodIndex, IncompatibleFrequency)
+ from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency
if is_signed_integer_dtype(data.dtype):
from .numeric import Int64Index
+
return Int64Index(data, copy=copy, dtype=dtype, name=name)
elif is_unsigned_integer_dtype(data.dtype):
from .numeric import UInt64Index
+
return UInt64Index(data, copy=copy, dtype=dtype, name=name)
elif is_float_dtype(data.dtype):
from .numeric import Float64Index
+
return Float64Index(data, copy=copy, dtype=dtype, name=name)
elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data):
- subarr = data.astype('object')
+ subarr = data.astype("object")
else:
subarr = com.asarray_tuplesafe(data, dtype=object)
@@ -407,54 +473,57 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
if dtype is None:
inferred = lib.infer_dtype(subarr, skipna=False)
- if inferred == 'integer':
+ if inferred == "integer":
try:
- return cls._try_convert_to_int_index(
- subarr, copy, name, dtype)
+ return cls._try_convert_to_int_index(subarr, copy, name, dtype)
except ValueError:
pass
- return Index(subarr, copy=copy,
- dtype=object, name=name)
- elif inferred in ['floating', 'mixed-integer-float']:
+ return Index(subarr, copy=copy, dtype=object, name=name)
+ elif inferred in ["floating", "mixed-integer-float"]:
from .numeric import Float64Index
+
return Float64Index(subarr, copy=copy, name=name)
- elif inferred == 'interval':
+ elif inferred == "interval":
from .interval import IntervalIndex
+
try:
return IntervalIndex(subarr, name=name, copy=copy)
except ValueError:
# GH27172: mixed closed Intervals --> object dtype
pass
- elif inferred == 'boolean':
+ elif inferred == "boolean":
# don't support boolean explicitly ATM
pass
- elif inferred != 'string':
- if inferred.startswith('datetime'):
- if (lib.is_datetime_with_singletz_array(subarr) or
- 'tz' in kwargs):
+ elif inferred != "string":
+ if inferred.startswith("datetime"):
+ if (
+ lib.is_datetime_with_singletz_array(subarr)
+ or "tz" in kwargs
+ ):
# only when subarr has the same tz
from pandas import DatetimeIndex
+
try:
- return DatetimeIndex(subarr, copy=copy,
- name=name, **kwargs)
+ return DatetimeIndex(
+ subarr, copy=copy, name=name, **kwargs
+ )
except OutOfBoundsDatetime:
pass
- elif inferred.startswith('timedelta'):
+ elif inferred.startswith("timedelta"):
from pandas import TimedeltaIndex
- return TimedeltaIndex(subarr, copy=copy, name=name,
- **kwargs)
- elif inferred == 'period':
+
+ return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs)
+ elif inferred == "period":
try:
return PeriodIndex(subarr, name=name, **kwargs)
except IncompatibleFrequency:
pass
return cls._simple_new(subarr, name)
- elif hasattr(data, '__array__'):
- return Index(np.asarray(data), dtype=dtype, copy=copy, name=name,
- **kwargs)
+ elif hasattr(data, "__array__"):
+ return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs)
elif data is None or is_scalar(data):
cls._scalar_data_error(data)
else:
@@ -467,8 +536,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# we must be all tuples, otherwise don't construct
# 10697
from .multi import MultiIndex
+
return MultiIndex.from_tuples(
- data, names=name or kwargs.get('names'))
+ data, names=name or kwargs.get("names")
+ )
# other iterable of some kind
subarr = com.asarray_tuplesafe(data, dtype=object)
return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs)
@@ -512,14 +583,15 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
Must be careful not to recurse.
"""
- if not hasattr(values, 'dtype'):
+ if not hasattr(values, "dtype"):
if (values is None or not len(values)) and dtype is not None:
values = np.empty(0, dtype=dtype)
else:
values = np.array(values, copy=False)
if is_object_dtype(values):
- values = cls(values, name=name, dtype=dtype,
- **kwargs)._ndarray_values
+ values = cls(
+ values, name=name, dtype=dtype, **kwargs
+ )._ndarray_values
if isinstance(values, (ABCSeries, ABCIndexClass)):
# Index._data must always be an ndarray.
@@ -553,7 +625,9 @@ def _get_attributes_dict(self):
"""
return {k: getattr(self, k, None) for k in self._attributes}
- _index_shared_docs['_shallow_copy'] = """
+ _index_shared_docs[
+ "_shallow_copy"
+ ] = """
Create a new Index with the same class as the caller, don't copy the
data, use the same object attributes with passed in attributes taking
precedence.
@@ -566,17 +640,17 @@ def _get_attributes_dict(self):
kwargs : updates the default attributes for this Index
"""
- @Appender(_index_shared_docs['_shallow_copy'])
+ @Appender(_index_shared_docs["_shallow_copy"])
def _shallow_copy(self, values=None, **kwargs):
if values is None:
values = self.values
attributes = self._get_attributes_dict()
attributes.update(kwargs)
- if not len(values) and 'dtype' not in kwargs:
- attributes['dtype'] = self.dtype
+ if not len(values) and "dtype" not in kwargs:
+ attributes["dtype"] = self.dtype
# _simple_new expects the type of self._data
- values = getattr(values, '_values', values)
+ values = getattr(values, "_values", values)
if isinstance(values, ABCDatetimeArray):
# `self.values` returns `self` for tz-aware, so we need to unwrap
# more specifically
@@ -599,9 +673,9 @@ def _shallow_copy_with_infer(self, values, **kwargs):
"""
attributes = self._get_attributes_dict()
attributes.update(kwargs)
- attributes['copy'] = False
- if not len(values) and 'dtype' not in kwargs:
- attributes['dtype'] = self.dtype
+ attributes["copy"] = False
+ if not len(values) and "dtype" not in kwargs:
+ attributes["dtype"] = self.dtype
if self._infer_as_myclass:
try:
return self._constructor(values, **attributes)
@@ -630,8 +704,7 @@ def is_(self, other):
True if both have same underlying data, False otherwise : bool
"""
# use something other than None to be clearer
- return self._id is getattr(
- other, '_id', Ellipsis) and self._id is not None
+ return self._id is getattr(other, "_id", Ellipsis) and self._id is not None
def _reset_identity(self):
"""
@@ -690,12 +763,15 @@ def dtype_str(self):
.. deprecated:: 0.25.0
"""
- warnings.warn('`dtype_str` has been deprecated. Call `str` on the '
- 'dtype attribute instead.', FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "`dtype_str` has been deprecated. Call `str` on the "
+ "dtype attribute instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return str(self.dtype)
- def ravel(self, order='C'):
+ def ravel(self, order="C"):
"""
Return an ndarray of the flattened values of the underlying data.
@@ -714,7 +790,7 @@ def view(self, cls=None):
# we need to see if we are subclassing an
# index type here
- if cls is not None and not hasattr(cls, '_typ'):
+ if cls is not None and not hasattr(cls, "_typ"):
result = self._data.view(cls)
else:
result = self._shallow_copy()
@@ -722,7 +798,9 @@ def view(self, cls=None):
result._id = self._id
return result
- _index_shared_docs['astype'] = """
+ _index_shared_docs[
+ "astype"
+ ] = """
Create an Index with values cast to dtypes. The class of a new Index
is determined by dtype. When conversion is impossible, a ValueError
exception is raised.
@@ -747,22 +825,22 @@ def view(self, cls=None):
Index with values cast to specified dtype.
"""
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
if is_dtype_equal(self.dtype, dtype):
return self.copy() if copy else self
elif is_categorical_dtype(dtype):
from .category import CategoricalIndex
- return CategoricalIndex(self.values, name=self.name, dtype=dtype,
- copy=copy)
+
+ return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy)
elif is_datetime64tz_dtype(dtype):
# TODO(GH-24559): Remove this block, use the following elif.
# avoid FutureWarning from DatetimeIndex constructor.
from pandas import DatetimeIndex
+
tz = pandas_dtype(dtype).tz
- return (DatetimeIndex(np.asarray(self))
- .tz_localize("UTC").tz_convert(tz))
+ return DatetimeIndex(np.asarray(self)).tz_localize("UTC").tz_convert(tz)
elif is_extension_array_dtype(dtype):
return Index(np.asarray(self), dtype=dtype, copy=copy)
@@ -770,15 +848,20 @@ def astype(self, dtype, copy=True):
try:
if is_datetime64tz_dtype(dtype):
from pandas import DatetimeIndex
- return DatetimeIndex(self.values, name=self.name, dtype=dtype,
- copy=copy)
- return Index(self.values.astype(dtype, copy=copy), name=self.name,
- dtype=dtype)
+
+ return DatetimeIndex(
+ self.values, name=self.name, dtype=dtype, copy=copy
+ )
+ return Index(
+ self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype
+ )
except (TypeError, ValueError):
- msg = 'Cannot cast {name} to dtype {dtype}'
+ msg = "Cannot cast {name} to dtype {dtype}"
raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
- _index_shared_docs['take'] = """
+ _index_shared_docs[
+ "take"
+ ] = """
Return a new %(klass)s of the values selected by the indices.
For internal compatibility with numpy arrays.
@@ -804,26 +887,29 @@ def astype(self, dtype, copy=True):
numpy.ndarray.take
"""
- @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
- def take(self, indices, axis=0, allow_fill=True,
- fill_value=None, **kwargs):
+ @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
if kwargs:
nv.validate_take(tuple(), kwargs)
indices = ensure_platform_int(indices)
if self._can_hold_na:
- taken = self._assert_take_fillable(self.values, indices,
- allow_fill=allow_fill,
- fill_value=fill_value,
- na_value=self._na_value)
+ taken = self._assert_take_fillable(
+ self.values,
+ indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=self._na_value,
+ )
else:
if allow_fill and fill_value is not None:
- msg = 'Unable to fill values because {0} cannot contain NA'
+ msg = "Unable to fill values because {0} cannot contain NA"
raise ValueError(msg.format(self.__class__.__name__))
taken = self.values.take(indices)
return self._shallow_copy(taken)
- def _assert_take_fillable(self, values, indices, allow_fill=True,
- fill_value=None, na_value=np.nan):
+ def _assert_take_fillable(
+ self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan
+ ):
"""
Internal method to handle NA filling of take.
"""
@@ -832,18 +918,21 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
# only fill if we are passing a non-None fill_value
if allow_fill and fill_value is not None:
if (indices < -1).any():
- msg = ('When allow_fill=True and fill_value is not None, '
- 'all indices must be >= -1')
+ msg = (
+ "When allow_fill=True and fill_value is not None, "
+ "all indices must be >= -1"
+ )
raise ValueError(msg)
- taken = algos.take(values,
- indices,
- allow_fill=allow_fill,
- fill_value=na_value)
+ taken = algos.take(
+ values, indices, allow_fill=allow_fill, fill_value=na_value
+ )
else:
taken = values.take(indices)
return taken
- _index_shared_docs['repeat'] = """
+ _index_shared_docs[
+ "repeat"
+ ] = """
Repeat elements of a %(klass)s.
Returns a new %(klass)s where each element of the current %(klass)s
@@ -880,7 +969,7 @@ def _assert_take_fillable(self, values, indices, allow_fill=True,
Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')
"""
- @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
return self._shallow_copy(self._values.repeat(repeats))
@@ -888,7 +977,9 @@ def repeat(self, repeats, axis=None):
# --------------------------------------------------------------------
# Copying Methods
- _index_shared_docs['copy'] = """
+ _index_shared_docs[
+ "copy"
+ ] = """
Make a copy of this object. Name and dtype sets those attributes on
the new object.
@@ -908,14 +999,14 @@ def repeat(self, repeats, axis=None):
``deep``, but if ``deep`` is passed it will attempt to deepcopy.
"""
- @Appender(_index_shared_docs['copy'])
+ @Appender(_index_shared_docs["copy"])
def copy(self, name=None, deep=False, dtype=None, **kwargs):
if deep:
new_index = self._shallow_copy(self._data.copy())
else:
new_index = self._shallow_copy()
- names = kwargs.get('names')
+ names = kwargs.get("names")
names = self._validate_names(name=name, names=names, deep=deep)
new_index = new_index.set_names(names)
@@ -949,12 +1040,11 @@ def __repr__(self):
attrs = self._format_attrs()
space = self._format_space()
- prepr = (",%s" %
- space).join("%s=%s" % (k, v) for k, v in attrs)
+ prepr = (",%s" % space).join("%s=%s" % (k, v) for k, v in attrs)
# no data provided, just attributes
if data is None:
- data = ''
+ data = ""
res = "%s(%s%s)" % (klass, data, prepr)
@@ -983,12 +1073,16 @@ def _format_data(self, name=None):
"""
# do we want to justify (only do so for non-objects)
- is_justify = not (self.inferred_type in ('string', 'unicode') or
- (self.inferred_type == 'categorical' and
- is_object_dtype(self.categories)))
+ is_justify = not (
+ self.inferred_type in ("string", "unicode")
+ or (
+ self.inferred_type == "categorical" and is_object_dtype(self.categories)
+ )
+ )
- return format_object_summary(self, self._formatter_func,
- is_justify=is_justify, name=name)
+ return format_object_summary(
+ self, self._formatter_func, is_justify=is_justify, name=name
+ )
def _format_attrs(self):
"""
@@ -1006,16 +1100,18 @@ def format(self, name=False, formatter=None, **kwargs):
"""
header = []
if name:
- header.append(pprint_thing(self.name,
- escape_chars=('\t', '\r', '\n')) if
- self.name is not None else '')
+ header.append(
+ pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
+ if self.name is not None
+ else ""
+ )
if formatter is not None:
return header + list(self.map(formatter))
return self._format_with_header(header, **kwargs)
- def _format_with_header(self, header, na_rep='NaN', **kwargs):
+ def _format_with_header(self, header, na_rep="NaN", **kwargs):
values = self.values
from pandas.io.formats.format import format_array
@@ -1027,8 +1123,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
values = lib.maybe_convert_objects(values, safe=1)
if is_object_dtype(values.dtype):
- result = [pprint_thing(x, escape_chars=('\t', '\r', '\n'))
- for x in values]
+ result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]
# could have nans
mask = isna(values)
@@ -1038,7 +1133,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs):
result = result.tolist()
else:
- result = _trim_front(format_array(values, None, justify='left'))
+ result = _trim_front(format_array(values, None, justify="left"))
return header + result
def to_native_types(self, slicer=None, **kwargs):
@@ -1072,7 +1167,7 @@ def to_native_types(self, slicer=None, **kwargs):
values = values[slicer]
return values._format_native_types(**kwargs)
- def _format_native_types(self, na_rep='', quoting=None, **kwargs):
+ def _format_native_types(self, na_rep="", quoting=None, **kwargs):
"""
Actually format specific types of the index.
"""
@@ -1100,19 +1195,18 @@ def _summary(self, name=None):
"""
if len(self) > 0:
head = self[0]
- if hasattr(head, 'format') and not isinstance(head, str):
+ if hasattr(head, "format") and not isinstance(head, str):
head = head.format()
tail = self[-1]
- if hasattr(tail, 'format') and not isinstance(tail, str):
+ if hasattr(tail, "format") and not isinstance(tail, str):
tail = tail.format()
- index_summary = ', %s to %s' % (pprint_thing(head),
- pprint_thing(tail))
+ index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail))
else:
- index_summary = ''
+ index_summary = ""
if name is None:
name = type(self).__name__
- return '%s: %s entries%s' % (name, len(self), index_summary)
+ return "%s: %s entries%s" % (name, len(self), index_summary)
def summary(self, name=None):
"""
@@ -1120,8 +1214,11 @@ def summary(self, name=None):
.. deprecated:: 0.23.0
"""
- warnings.warn("'summary' is deprecated and will be removed in a "
- "future version.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'summary' is deprecated and will be removed in a " "future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._summary(name)
# --------------------------------------------------------------------
@@ -1227,6 +1324,7 @@ def to_frame(self, index=True, name=None):
"""
from pandas import DataFrame
+
if name is None:
name = self.name or 0
result = DataFrame({name: self._values.copy()})
@@ -1244,6 +1342,7 @@ def _validate_names(self, name=None, names=None, deep=False):
Index and plural 'names' parameter for MultiIndex.
"""
from copy import deepcopy
+
if names is not None and name is not None:
raise TypeError("Can only provide one of `names` and `name`")
elif names is None and name is None:
@@ -1258,7 +1357,7 @@ def _validate_names(self, name=None, names=None, deep=False):
return name
def _get_names(self):
- return FrozenList((self.name, ))
+ return FrozenList((self.name,))
def _set_names(self, values, level=None):
"""
@@ -1277,17 +1376,17 @@ def _set_names(self, values, level=None):
TypeError if each name is not hashable.
"""
if not is_list_like(values):
- raise ValueError('Names must be a list-like')
+ raise ValueError("Names must be a list-like")
if len(values) != 1:
- raise ValueError('Length of new names must be 1, got %d' %
- len(values))
+ raise ValueError("Length of new names must be 1, got %d" % len(values))
# GH 20527
# All items in 'name' need to be hashable:
for name in values:
if not is_hashable(name):
- raise TypeError('{}.name must be a hashable type'
- .format(self.__class__.__name__))
+ raise TypeError(
+ "{}.name must be a hashable type".format(self.__class__.__name__)
+ )
self.name = values[0]
names = property(fset=_set_names, fget=_get_names)
@@ -1350,10 +1449,9 @@ def set_names(self, names, level=None, inplace=False):
"""
if level is not None and not isinstance(self, ABCMultiIndex):
- raise ValueError('Level must be None for non-MultiIndex')
+ raise ValueError("Level must be None for non-MultiIndex")
- if level is not None and not is_list_like(level) and is_list_like(
- names):
+ if level is not None and not is_list_like(level) and is_list_like(names):
msg = "Names must be a string when a single level is provided."
raise TypeError(msg)
@@ -1450,15 +1548,16 @@ def _validate_index_level(self, level):
"""
if isinstance(level, int):
if level < 0 and level != -1:
- raise IndexError("Too many levels: Index has only 1 level,"
- " %d is not a valid level number" % (level, ))
+ raise IndexError(
+ "Too many levels: Index has only 1 level,"
+ " %d is not a valid level number" % (level,)
+ )
elif level > 0:
- raise IndexError("Too many levels:"
- " Index has only 1 level, not %d" %
- (level + 1))
+ raise IndexError(
+ "Too many levels:" " Index has only 1 level, not %d" % (level + 1)
+ )
elif level != self.name:
- raise KeyError('Level %s must be same as name (%s)' %
- (level, self.name))
+ raise KeyError("Level %s must be same as name (%s)" % (level, self.name))
def _get_level_number(self, level):
self._validate_index_level(level)
@@ -1552,9 +1651,11 @@ def droplevel(self, level=0):
if len(level) == 0:
return self
if len(level) >= self.nlevels:
- raise ValueError("Cannot remove {} levels from an index with {} "
- "levels: at least one level must be "
- "left.".format(len(level), self.nlevels))
+ raise ValueError(
+ "Cannot remove {} levels from an index with {} "
+ "levels: at least one level must be "
+ "left.".format(len(level), self.nlevels)
+ )
# The two checks above guarantee that here self is a MultiIndex
new_levels = list(self.levels)
@@ -1578,10 +1679,17 @@ def droplevel(self, level=0):
return result
else:
from .multi import MultiIndex
- return MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
- _index_shared_docs['_get_grouper_for_level'] = """
+ return MultiIndex(
+ levels=new_levels,
+ codes=new_codes,
+ names=new_names,
+ verify_integrity=False,
+ )
+
+ _index_shared_docs[
+ "_get_grouper_for_level"
+ ] = """
Get index grouper corresponding to an index level
Parameters
@@ -1601,7 +1709,7 @@ def droplevel(self, level=0):
Index of unique values for level.
"""
- @Appender(_index_shared_docs['_get_grouper_for_level'])
+ @Appender(_index_shared_docs["_get_grouper_for_level"])
def _get_grouper_for_level(self, mapper, level=None):
assert level is None or level == 0
if mapper is None:
@@ -1704,16 +1812,16 @@ def has_duplicates(self):
return not self.is_unique
def is_boolean(self):
- return self.inferred_type in ['boolean']
+ return self.inferred_type in ["boolean"]
def is_integer(self):
- return self.inferred_type in ['integer']
+ return self.inferred_type in ["integer"]
def is_floating(self):
- return self.inferred_type in ['floating', 'mixed-integer-float']
+ return self.inferred_type in ["floating", "mixed-integer-float"]
def is_numeric(self):
- return self.inferred_type in ['integer', 'floating']
+ return self.inferred_type in ["integer", "floating"]
def is_object(self):
return is_object_dtype(self.dtype)
@@ -1752,19 +1860,19 @@ def is_categorical(self):
>>> s.index.is_categorical()
False
"""
- return self.inferred_type in ['categorical']
+ return self.inferred_type in ["categorical"]
def is_interval(self):
- return self.inferred_type in ['interval']
+ return self.inferred_type in ["interval"]
def is_mixed(self):
- return self.inferred_type in ['mixed']
+ return self.inferred_type in ["mixed"]
def holds_integer(self):
"""
Whether the type is an integer type.
"""
- return self.inferred_type in ['integer', 'mixed-integer']
+ return self.inferred_type in ["integer", "mixed-integer"]
@cache_readonly
def inferred_type(self):
@@ -1793,7 +1901,7 @@ def __setstate__(self, state):
"""
if isinstance(state, dict):
- self._data = state.pop('data')
+ self._data = state.pop("data")
for k, v in state.items():
setattr(self, k, v)
@@ -1909,6 +2017,7 @@ def isna(self):
array([False, True, True, True], dtype=bool)
"""
return self._isnan
+
isnull = isna
def notna(self):
@@ -1956,9 +2065,12 @@ def notna(self):
array([ True, True, True, False])
"""
return ~self.isna()
+
notnull = notna
- _index_shared_docs['fillna'] = """
+ _index_shared_docs[
+ "fillna"
+ ] = """
Fill NA/NaN values with the specified value
Parameters
@@ -1976,7 +2088,7 @@ def notna(self):
filled : Index
"""
- @Appender(_index_shared_docs['fillna'])
+ @Appender(_index_shared_docs["fillna"])
def fillna(self, value=None, downcast=None):
self._assert_can_do_op(value)
if self.hasnans:
@@ -1987,7 +2099,9 @@ def fillna(self, value=None, downcast=None):
return Index(result, name=self.name)
return self._shallow_copy()
- _index_shared_docs['dropna'] = """
+ _index_shared_docs[
+ "dropna"
+ ] = """
Return Index without NA/NaN values
Parameters
@@ -2001,9 +2115,9 @@ def fillna(self, value=None, downcast=None):
valid : Index
"""
- @Appender(_index_shared_docs['dropna'])
- def dropna(self, how='any'):
- if how not in ('any', 'all'):
+ @Appender(_index_shared_docs["dropna"])
+ def dropna(self, how="any"):
+ if how not in ("any", "all"):
raise ValueError("invalid how option: {0}".format(how))
if self.hasnans:
@@ -2013,8 +2127,9 @@ def dropna(self, how='any'):
# --------------------------------------------------------------------
# Uniqueness Methods
- _index_shared_docs['index_unique'] = (
- """
+ _index_shared_docs[
+ "index_unique"
+ ] = """
Return unique values in the index. Uniques are returned in order
of appearance, this does NOT sort.
@@ -2033,16 +2148,16 @@ def dropna(self, how='any'):
--------
unique
Series.unique
- """)
+ """
- @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs)
def unique(self, level=None):
if level is not None:
self._validate_index_level(level)
result = super().unique()
return self._shallow_copy(result)
- def drop_duplicates(self, keep='first'):
+ def drop_duplicates(self, keep="first"):
"""
Return Index with duplicate values removed.
@@ -2090,7 +2205,7 @@ def drop_duplicates(self, keep='first'):
"""
return super().drop_duplicates(keep=keep)
- def duplicated(self, keep='first'):
+ def duplicated(self, keep="first"):
"""
Indicate duplicate index values.
@@ -2198,10 +2313,13 @@ def get_duplicates(self):
>>> pd.Index(dates).get_duplicates() # doctest: +SKIP
DatetimeIndex([], dtype='datetime64[ns]', freq=None)
"""
- warnings.warn("'get_duplicates' is deprecated and will be removed in "
- "a future release. You can use "
- "idx[idx.duplicated()].unique() instead",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'get_duplicates' is deprecated and will be removed in "
+ "a future release. You can use "
+ "idx[idx.duplicated()].unique() instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self[self.duplicated()].unique()
@@ -2266,9 +2384,12 @@ def __xor__(self, other):
return self.symmetric_difference(other)
def __nonzero__(self):
- raise ValueError("The truth value of a {0} is ambiguous. "
- "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
- .format(self.__class__.__name__))
+ raise ValueError(
+ "The truth value of a {0} is ambiguous. "
+ "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(
+ self.__class__.__name__
+ )
+ )
__bool__ = __nonzero__
@@ -2324,13 +2445,14 @@ def _is_compatible_with_other(self, other):
-------
bool
"""
- return (type(self) is type(other)
- and is_dtype_equal(self.dtype, other.dtype))
+ return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype)
def _validate_sort_keyword(self, sort):
if sort not in [None, False]:
- raise ValueError("The 'sort' keyword only takes the values of "
- "None or False; {0} was passed.".format(sort))
+ raise ValueError(
+ "The 'sort' keyword only takes the values of "
+ "None or False; {0} was passed.".format(sort)
+ )
def union(self, other, sort=None):
"""
@@ -2443,8 +2565,7 @@ def _union(self, other, sort):
indexer, = (indexer == -1).nonzero()
if len(indexer) > 0:
- other_diff = algos.take_nd(rvals, indexer,
- allow_fill=False)
+ other_diff = algos.take_nd(rvals, indexer, allow_fill=False)
result = _concat._concat_compat((lvals, other_diff))
else:
@@ -2454,9 +2575,12 @@ def _union(self, other, sort):
try:
result = sorting.safe_sort(result)
except TypeError as e:
- warnings.warn("{}, sort order is undefined for "
- "incomparable objects".format(e),
- RuntimeWarning, stacklevel=3)
+ warnings.warn(
+ "{}, sort order is undefined for "
+ "incomparable objects".format(e),
+ RuntimeWarning,
+ stacklevel=3,
+ )
# for subclasses
return self._wrap_setop_result(other, result)
@@ -2464,7 +2588,9 @@ def _union(self, other, sort):
def _wrap_setop_result(self, other, result):
return self._constructor(result, name=get_op_result_name(self, other))
- _index_shared_docs['intersection'] = """
+ _index_shared_docs[
+ "intersection"
+ ] = """
Form the intersection of two Index objects.
This returns a new Index with elements common to the index and `other`.
@@ -2500,7 +2626,7 @@ def _wrap_setop_result(self, other, result):
"""
# TODO: standardize return type of non-union setops type(self vs other)
- @Appender(_index_shared_docs['intersection'])
+ @Appender(_index_shared_docs["intersection"])
def intersection(self, other, sort=False):
self._validate_sort_keyword(sort)
self._assert_can_do_setop(other)
@@ -2510,8 +2636,8 @@ def intersection(self, other, sort=False):
return self._get_reconciled_name_object(other)
if not is_dtype_equal(self.dtype, other.dtype):
- this = self.astype('O')
- other = other.astype('O')
+ this = self.astype("O")
+ other = other.astype("O")
return this.intersection(other, sort=sort)
# TODO(EA): setops-refactor, clean all this up
@@ -2536,8 +2662,7 @@ def intersection(self, other, sort=False):
indexer = indexer.take((indexer != -1).nonzero()[0])
except Exception:
# duplicates
- indexer = algos.unique1d(
- Index(rvals).get_indexer_non_unique(lvals)[0])
+ indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0])
indexer = indexer[indexer != -1]
taken = other.take(indexer)
@@ -2609,8 +2734,7 @@ def difference(self, other, sort=None):
indexer = this.get_indexer(other)
indexer = indexer.take((indexer != -1).nonzero()[0])
- label_diff = np.setdiff1d(np.arange(this.size), indexer,
- assume_unique=True)
+ label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
the_diff = this.values.take(label_diff)
if sort is None:
try:
@@ -2679,8 +2803,9 @@ def symmetric_difference(self, other, result_name=None, sort=None):
# {this} minus {other}
common_indexer = indexer.take((indexer != -1).nonzero()[0])
- left_indexer = np.setdiff1d(np.arange(this.size), common_indexer,
- assume_unique=True)
+ left_indexer = np.setdiff1d(
+ np.arange(this.size), common_indexer, assume_unique=True
+ )
left_diff = this.values.take(left_indexer)
# {other} minus {this}
@@ -2695,14 +2820,14 @@ def symmetric_difference(self, other, result_name=None, sort=None):
pass
attribs = self._get_attributes_dict()
- attribs['name'] = result_name
- if 'freq' in attribs:
- attribs['freq'] = None
+ attribs["name"] = result_name
+ if "freq" in attribs:
+ attribs["freq"] = None
return self._shallow_copy_with_infer(the_diff, **attribs)
def _assert_can_do_setop(self, other):
if not is_list_like(other):
- raise TypeError('Input must be Index or array-like')
+ raise TypeError("Input must be Index or array-like")
return True
def _convert_can_do_setop(self, other):
@@ -2716,7 +2841,9 @@ def _convert_can_do_setop(self, other):
# --------------------------------------------------------------------
# Indexing Methods
- _index_shared_docs['get_loc'] = """
+ _index_shared_docs[
+ "get_loc"
+ ] = """
Get integer location, slice or boolean mask for requested label.
Parameters
@@ -2754,25 +2881,29 @@ def _convert_can_do_setop(self, other):
array([False, True, False, True], dtype=bool)
"""
- @Appender(_index_shared_docs['get_loc'])
+ @Appender(_index_shared_docs["get_loc"])
def get_loc(self, key, method=None, tolerance=None):
if method is None:
if tolerance is not None:
- raise ValueError('tolerance argument only valid if using pad, '
- 'backfill or nearest lookups')
+ raise ValueError(
+ "tolerance argument only valid if using pad, "
+ "backfill or nearest lookups"
+ )
try:
return self._engine.get_loc(key)
except KeyError:
return self._engine.get_loc(self._maybe_cast_indexer(key))
indexer = self.get_indexer([key], method=method, tolerance=tolerance)
if indexer.ndim > 1 or indexer.size > 1:
- raise TypeError('get_loc requires scalar valued input')
+ raise TypeError("get_loc requires scalar valued input")
loc = indexer.item()
if loc == -1:
raise KeyError(key)
return loc
- _index_shared_docs['get_indexer'] = """
+ _index_shared_docs[
+ "get_indexer"
+ ] = """
Compute indexer and mask for new index given the current index. The
indexer should be then used as an input to ndarray.take to align the
current data to the new index.
@@ -2819,7 +2950,7 @@ def get_loc(self, key, method=None, tolerance=None):
and ``x`` is marked by -1, as it is not in ``index``.
"""
- @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
method = missing.clean_reindex_fill_method(method)
target = ensure_index(target)
@@ -2834,30 +2965,37 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
pself, ptarget = self._maybe_promote(target)
if pself is not self or ptarget is not target:
- return pself.get_indexer(ptarget, method=method, limit=limit,
- tolerance=tolerance)
+ return pself.get_indexer(
+ ptarget, method=method, limit=limit, tolerance=tolerance
+ )
if not is_dtype_equal(self.dtype, target.dtype):
this = self.astype(object)
target = target.astype(object)
- return this.get_indexer(target, method=method, limit=limit,
- tolerance=tolerance)
+ return this.get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
if not self.is_unique:
- raise InvalidIndexError('Reindexing only valid with uniquely'
- ' valued Index objects')
+ raise InvalidIndexError(
+ "Reindexing only valid with uniquely" " valued Index objects"
+ )
- if method == 'pad' or method == 'backfill':
+ if method == "pad" or method == "backfill":
indexer = self._get_fill_indexer(target, method, limit, tolerance)
- elif method == 'nearest':
+ elif method == "nearest":
indexer = self._get_nearest_indexer(target, limit, tolerance)
else:
if tolerance is not None:
- raise ValueError('tolerance argument only valid if doing pad, '
- 'backfill or nearest reindexing')
+ raise ValueError(
+ "tolerance argument only valid if doing pad, "
+ "backfill or nearest reindexing"
+ )
if limit is not None:
- raise ValueError('limit argument only valid if doing pad, '
- 'backfill or nearest reindexing')
+ raise ValueError(
+ "limit argument only valid if doing pad, "
+ "backfill or nearest reindexing"
+ )
indexer = self._engine.get_indexer(target._ndarray_values)
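A minimal illustration of get_indexer matching the docstring example above (invented labels, assuming pandas imported as pd):

    import pandas as pd

    idx = pd.Index(["c", "a", "b"])
    idx.get_indexer(["a", "b", "x"])   # array([ 1,  2, -1]); -1 marks labels absent from idx
    # the result is meant to be fed to ndarray.take to align data to the new labels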
@@ -2867,22 +3005,23 @@ def _convert_tolerance(self, tolerance, target):
# override this method on subclasses
tolerance = np.asarray(tolerance)
if target.size != tolerance.size and tolerance.size > 1:
- raise ValueError('list-like tolerance size must match '
- 'target index size')
+ raise ValueError("list-like tolerance size must match " "target index size")
return tolerance
def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
if self.is_monotonic_increasing and target.is_monotonic_increasing:
- method = (self._engine.get_pad_indexer if method == 'pad' else
- self._engine.get_backfill_indexer)
+ method = (
+ self._engine.get_pad_indexer
+ if method == "pad"
+ else self._engine.get_backfill_indexer
+ )
indexer = method(target._ndarray_values, limit)
else:
- indexer = self._get_fill_indexer_searchsorted(target, method,
- limit)
+ indexer = self._get_fill_indexer_searchsorted(target, method, limit)
if tolerance is not None:
- indexer = self._filter_indexer_tolerance(target._ndarray_values,
- indexer,
- tolerance)
+ indexer = self._filter_indexer_tolerance(
+ target._ndarray_values, indexer, tolerance
+ )
return indexer
def _get_fill_indexer_searchsorted(self, target, method, limit=None):
@@ -2891,17 +3030,18 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
indexes and non-monotonic targets.
"""
if limit is not None:
- raise ValueError('limit argument for %r method only well-defined '
- 'if index and target are monotonic' % method)
+ raise ValueError(
+ "limit argument for %r method only well-defined "
+ "if index and target are monotonic" % method
+ )
- side = 'left' if method == 'pad' else 'right'
+ side = "left" if method == "pad" else "right"
# find exact matches first (this simplifies the algorithm)
indexer = self.get_indexer(target)
- nonexact = (indexer == -1)
- indexer[nonexact] = self._searchsorted_monotonic(target[nonexact],
- side)
- if side == 'left':
+ nonexact = indexer == -1
+ indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side)
+ if side == "left":
# searchsorted returns "indices into a sorted array such that,
# if the corresponding elements in v were inserted before the
# indices, the order of a would be preserved".
@@ -2921,19 +3061,21 @@ def _get_nearest_indexer(self, target, limit, tolerance):
values that can be subtracted from each other (e.g., not strings or
tuples).
"""
- left_indexer = self.get_indexer(target, 'pad', limit=limit)
- right_indexer = self.get_indexer(target, 'backfill', limit=limit)
+ left_indexer = self.get_indexer(target, "pad", limit=limit)
+ right_indexer = self.get_indexer(target, "backfill", limit=limit)
target = np.asarray(target)
left_distances = abs(self.values[left_indexer] - target)
right_distances = abs(self.values[right_indexer] - target)
op = operator.lt if self.is_monotonic_increasing else operator.le
- indexer = np.where(op(left_distances, right_distances) |
- (right_indexer == -1), left_indexer, right_indexer)
+ indexer = np.where(
+ op(left_distances, right_distances) | (right_indexer == -1),
+ left_indexer,
+ right_indexer,
+ )
if tolerance is not None:
- indexer = self._filter_indexer_tolerance(target, indexer,
- tolerance)
+ indexer = self._filter_indexer_tolerance(target, indexer, tolerance)
return indexer
def _filter_indexer_tolerance(self, target, indexer, tolerance):
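A small sketch of the pad/nearest machinery reformatted above, assuming a monotonic numeric index (invented values):

    import pandas as pd

    idx = pd.Index([10, 20, 30])
    idx.get_indexer([12, 26], method="nearest")               # array([0, 2])
    idx.get_indexer([12, 26], method="nearest", tolerance=3)   # array([ 0, -1]); 26 is more than 3 away
    idx.get_indexer([12, 26], method="pad")                    # array([0, 1]); previous label for each target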
@@ -2944,7 +3086,9 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance):
# --------------------------------------------------------------------
# Indexer Conversion Methods
- _index_shared_docs['_convert_scalar_indexer'] = """
+ _index_shared_docs[
+ "_convert_scalar_indexer"
+ ] = """
Convert a scalar indexer.
Parameters
@@ -2953,43 +3097,47 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance):
kind : {'ix', 'loc', 'getitem', 'iloc'} or None
"""
- @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ @Appender(_index_shared_docs["_convert_scalar_indexer"])
def _convert_scalar_indexer(self, key, kind=None):
- assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+ assert kind in ["ix", "loc", "getitem", "iloc", None]
- if kind == 'iloc':
- return self._validate_indexer('positional', key, kind)
+ if kind == "iloc":
+ return self._validate_indexer("positional", key, kind)
- if len(self) and not isinstance(self, ABCMultiIndex,):
+ if len(self) and not isinstance(self, ABCMultiIndex):
# we can raise here if we are definitive that this
# is positional indexing (eg. .ix on with a float)
# or label indexing if we are using a type able
# to be represented in the index
- if kind in ['getitem', 'ix'] and is_float(key):
+ if kind in ["getitem", "ix"] and is_float(key):
if not self.is_floating():
- return self._invalid_indexer('label', key)
+ return self._invalid_indexer("label", key)
- elif kind in ['loc'] and is_float(key):
+ elif kind in ["loc"] and is_float(key):
# we want to raise KeyError on string/mixed here
# technically we *could* raise a TypeError
# on anything but mixed though
- if self.inferred_type not in ['floating',
- 'mixed-integer-float',
- 'string',
- 'unicode',
- 'mixed']:
- return self._invalid_indexer('label', key)
-
- elif kind in ['loc'] and is_integer(key):
+ if self.inferred_type not in [
+ "floating",
+ "mixed-integer-float",
+ "string",
+ "unicode",
+ "mixed",
+ ]:
+ return self._invalid_indexer("label", key)
+
+ elif kind in ["loc"] and is_integer(key):
if not self.holds_integer():
- return self._invalid_indexer('label', key)
+ return self._invalid_indexer("label", key)
return key
- _index_shared_docs['_convert_slice_indexer'] = """
+ _index_shared_docs[
+ "_convert_slice_indexer"
+ ] = """
Convert a slice indexer.
By definition, these are labels unless 'iloc' is passed in.
@@ -3001,19 +3149,21 @@ def _convert_scalar_indexer(self, key, kind=None):
kind : {'ix', 'loc', 'getitem', 'iloc'} or None
"""
- @Appender(_index_shared_docs['_convert_slice_indexer'])
+ @Appender(_index_shared_docs["_convert_slice_indexer"])
def _convert_slice_indexer(self, key, kind=None):
- assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+ assert kind in ["ix", "loc", "getitem", "iloc", None]
# if we are not a slice, then we are done
if not isinstance(key, slice):
return key
# validate iloc
- if kind == 'iloc':
- return slice(self._validate_indexer('slice', key.start, kind),
- self._validate_indexer('slice', key.stop, kind),
- self._validate_indexer('slice', key.step, kind))
+ if kind == "iloc":
+ return slice(
+ self._validate_indexer("slice", key.start, kind),
+ self._validate_indexer("slice", key.stop, kind),
+ self._validate_indexer("slice", key.step, kind),
+ )
# potentially cast the bounds to integers
start, stop, step = key.start, key.stop, key.step
@@ -3026,15 +3176,17 @@ def is_int(v):
is_index_slice = is_int(start) and is_int(stop)
is_positional = is_index_slice and not self.is_integer()
- if kind == 'getitem':
+ if kind == "getitem":
"""
called from the getitem slicers, validate that we are in fact
integers
"""
if self.is_integer() or is_index_slice:
- return slice(self._validate_indexer('slice', key.start, kind),
- self._validate_indexer('slice', key.stop, kind),
- self._validate_indexer('slice', key.step, kind))
+ return slice(
+ self._validate_indexer("slice", key.start, kind),
+ self._validate_indexer("slice", key.stop, kind),
+ self._validate_indexer("slice", key.step, kind),
+ )
# convert the slice to an indexer here
@@ -3048,7 +3200,7 @@ def is_int(v):
self.get_loc(stop)
is_positional = False
except KeyError:
- if self.inferred_type == 'mixed-integer-float':
+ if self.inferred_type == "mixed-integer-float":
raise
if is_null_slicer:
@@ -3091,7 +3243,9 @@ def _convert_listlike_indexer(self, keyarr, kind=None):
indexer = self._convert_list_indexer(keyarr, kind=kind)
return indexer, keyarr
- _index_shared_docs['_convert_arr_indexer'] = """
+ _index_shared_docs[
+ "_convert_arr_indexer"
+ ] = """
Convert an array-like indexer to the appropriate dtype.
Parameters
@@ -3104,12 +3258,14 @@ def _convert_listlike_indexer(self, keyarr, kind=None):
converted_keyarr : array-like
"""
- @Appender(_index_shared_docs['_convert_arr_indexer'])
+ @Appender(_index_shared_docs["_convert_arr_indexer"])
def _convert_arr_indexer(self, keyarr):
keyarr = com.asarray_tuplesafe(keyarr)
return keyarr
- _index_shared_docs['_convert_index_indexer'] = """
+ _index_shared_docs[
+ "_convert_index_indexer"
+ ] = """
Convert an Index indexer to the appropriate dtype.
Parameters
@@ -3122,11 +3278,13 @@ def _convert_arr_indexer(self, keyarr):
converted_keyarr : Index (or sub-class)
"""
- @Appender(_index_shared_docs['_convert_index_indexer'])
+ @Appender(_index_shared_docs["_convert_index_indexer"])
def _convert_index_indexer(self, keyarr):
return keyarr
- _index_shared_docs['_convert_list_indexer'] = """
+ _index_shared_docs[
+ "_convert_list_indexer"
+ ] = """
Convert a list-like indexer to the appropriate dtype.
Parameters
@@ -3140,13 +3298,16 @@ def _convert_index_indexer(self, keyarr):
positional indexer or None
"""
- @Appender(_index_shared_docs['_convert_list_indexer'])
+ @Appender(_index_shared_docs["_convert_list_indexer"])
def _convert_list_indexer(self, keyarr, kind=None):
- if (kind in [None, 'iloc', 'ix'] and
- is_integer_dtype(keyarr) and not self.is_floating() and
- not isinstance(keyarr, ABCPeriodIndex)):
-
- if self.inferred_type == 'mixed-integer':
+ if (
+ kind in [None, "iloc", "ix"]
+ and is_integer_dtype(keyarr)
+ and not self.is_floating()
+ and not isinstance(keyarr, ABCPeriodIndex)
+ ):
+
+ if self.inferred_type == "mixed-integer":
indexer = self.get_indexer(keyarr)
if (indexer >= 0).all():
return indexer
@@ -3157,9 +3318,10 @@ def _convert_list_indexer(self, keyarr, kind=None):
# IndexError in maybe_convert_indices
indexer[indexer < 0] = len(self)
from pandas.core.indexing import maybe_convert_indices
+
return maybe_convert_indices(indexer, len(self))
- elif not self.inferred_type == 'integer':
+ elif not self.inferred_type == "integer":
keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr)
return keyarr
@@ -3169,10 +3331,12 @@ def _invalid_indexer(self, form, key):
"""
Consistent invalid indexer message.
"""
- raise TypeError("cannot do {form} indexing on {klass} with these "
- "indexers [{key}] of {kind}".format(
- form=form, klass=type(self), key=key,
- kind=type(key)))
+ raise TypeError(
+ "cannot do {form} indexing on {klass} with these "
+ "indexers [{key}] of {kind}".format(
+ form=form, klass=type(self), key=key, kind=type(key)
+ )
+ )
# --------------------------------------------------------------------
# Reindex Methods
@@ -3194,8 +3358,7 @@ def _can_reindex(self, indexer):
if not self.is_unique and len(indexer):
raise ValueError("cannot reindex from a duplicate axis")
- def reindex(self, target, method=None, level=None, limit=None,
- tolerance=None):
+ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
"""
Create index with target's values (move/add/delete values
as necessary).
@@ -3213,14 +3376,14 @@ def reindex(self, target, method=None, level=None, limit=None,
"""
# GH6552: preserve names when reindexing to non-named target
# (i.e. neither Index nor Series).
- preserve_names = not hasattr(target, 'name')
+ preserve_names = not hasattr(target, "name")
# GH7774: preserve dtype/tz if target is empty and not an Index.
target = _ensure_has_len(target) # target may be an iterator
if not isinstance(target, Index) and len(target) == 0:
attrs = self._get_attributes_dict()
- attrs.pop('freq', None) # don't preserve freq
+ attrs.pop("freq", None) # don't preserve freq
values = self._data[:0] # appropriately-dtyped empty array
target = self._simple_new(values, dtype=self.dtype, **attrs)
else:
@@ -3228,23 +3391,25 @@ def reindex(self, target, method=None, level=None, limit=None,
if level is not None:
if method is not None:
- raise TypeError('Fill method not supported if level passed')
- _, indexer, _ = self._join_level(target, level, how='right',
- return_indexers=True)
+ raise TypeError("Fill method not supported if level passed")
+ _, indexer, _ = self._join_level(
+ target, level, how="right", return_indexers=True
+ )
else:
if self.equals(target):
indexer = None
else:
# check is_overlapping for IntervalIndex compat
- if (self.is_unique and
- not getattr(self, 'is_overlapping', False)):
- indexer = self.get_indexer(target, method=method,
- limit=limit,
- tolerance=tolerance)
+ if self.is_unique and not getattr(self, "is_overlapping", False):
+ indexer = self.get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
else:
if method is not None or limit is not None:
- raise ValueError("cannot reindex a non-unique index "
- "with a method or limit")
+ raise ValueError(
+ "cannot reindex a non-unique index "
+ "with a method or limit"
+ )
indexer, missing = self.get_indexer_non_unique(target)
if preserve_names and target.nlevels == 1 and target.name != self.name:
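An illustrative call to Index.reindex as reformatted above (invented labels):

    import pandas as pd

    idx = pd.Index(["a", "b", "c"])
    new_idx, indexer = idx.reindex(["b", "d"])
    # new_idx -> Index(['b', 'd'], dtype='object')
    # indexer -> array([ 1, -1]); 'd' is not present, so it maps to -1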
@@ -3315,7 +3480,9 @@ def _reindex_non_unique(self, target):
# --------------------------------------------------------------------
# Join Methods
- _index_shared_docs['join'] = """
+ _index_shared_docs[
+ "join"
+ ] = """
Compute join_index and indexers to conform data
structures to the new index.
@@ -3336,9 +3503,8 @@ def _reindex_non_unique(self, target):
join_index, (left_indexer, right_indexer)
"""
- @Appender(_index_shared_docs['join'])
- def join(self, other, how='left', level=None, return_indexers=False,
- sort=False):
+ @Appender(_index_shared_docs["join"])
+ def join(self, other, how="left", level=None, return_indexers=False, sort=False):
self_is_mi = isinstance(self, ABCMultiIndex)
other_is_mi = isinstance(other, ABCMultiIndex)
@@ -3350,17 +3516,17 @@ def join(self, other, how='left', level=None, return_indexers=False,
if self.names == other.names:
pass
else:
- return self._join_multi(other, how=how,
- return_indexers=return_indexers)
+ return self._join_multi(other, how=how, return_indexers=return_indexers)
# join on the level
if level is not None and (self_is_mi or other_is_mi):
- return self._join_level(other, level, how=how,
- return_indexers=return_indexers)
+ return self._join_level(
+ other, level, how=how, return_indexers=return_indexers
+ )
other = ensure_index(other)
- if len(other) == 0 and how in ('left', 'outer'):
+ if len(other) == 0 and how in ("left", "outer"):
join_index = self._shallow_copy()
if return_indexers:
rindexer = np.repeat(-1, len(join_index))
@@ -3368,7 +3534,7 @@ def join(self, other, how='left', level=None, return_indexers=False,
else:
return join_index
- if len(self) == 0 and how in ('right', 'outer'):
+ if len(self) == 0 and how in ("right", "outer"):
join_index = other._shallow_copy()
if return_indexers:
lindexer = np.repeat(-1, len(join_index))
@@ -3377,47 +3543,52 @@ def join(self, other, how='left', level=None, return_indexers=False,
return join_index
if self._join_precedence < other._join_precedence:
- how = {'right': 'left', 'left': 'right'}.get(how, how)
- result = other.join(self, how=how, level=level,
- return_indexers=return_indexers)
+ how = {"right": "left", "left": "right"}.get(how, how)
+ result = other.join(
+ self, how=how, level=level, return_indexers=return_indexers
+ )
if return_indexers:
x, y, z = result
result = x, z, y
return result
if not is_dtype_equal(self.dtype, other.dtype):
- this = self.astype('O')
- other = other.astype('O')
+ this = self.astype("O")
+ other = other.astype("O")
return this.join(other, how=how, return_indexers=return_indexers)
_validate_join_method(how)
if not self.is_unique and not other.is_unique:
- return self._join_non_unique(other, how=how,
- return_indexers=return_indexers)
+ return self._join_non_unique(
+ other, how=how, return_indexers=return_indexers
+ )
elif not self.is_unique or not other.is_unique:
if self.is_monotonic and other.is_monotonic:
- return self._join_monotonic(other, how=how,
- return_indexers=return_indexers)
+ return self._join_monotonic(
+ other, how=how, return_indexers=return_indexers
+ )
else:
- return self._join_non_unique(other, how=how,
- return_indexers=return_indexers)
+ return self._join_non_unique(
+ other, how=how, return_indexers=return_indexers
+ )
elif self.is_monotonic and other.is_monotonic:
try:
- return self._join_monotonic(other, how=how,
- return_indexers=return_indexers)
+ return self._join_monotonic(
+ other, how=how, return_indexers=return_indexers
+ )
except TypeError:
pass
- if how == 'left':
+ if how == "left":
join_index = self
- elif how == 'right':
+ elif how == "right":
join_index = other
- elif how == 'inner':
+ elif how == "inner":
# TODO: sort=False here for backwards compat. It may
# be better to use the sort parameter passed into join
join_index = self.intersection(other, sort=False)
- elif how == 'outer':
+ elif how == "outer":
# TODO: sort=True here for backwards compat. It may
# be better to use the sort parameter passed into join
join_index = self.union(other)
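A minimal illustration of the `how` dispatch in Index.join shown above (invented values):

    import pandas as pd

    left = pd.Index([1, 2, 3])
    right = pd.Index([2, 3, 4])
    left.join(right, how="inner")    # Int64Index([2, 3], dtype='int64')
    left.join(right, how="outer", return_indexers=True)
    # -> (Int64Index([1, 2, 3, 4], dtype='int64'), left_indexer, right_indexer)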
@@ -3465,23 +3636,23 @@ def _join_multi(self, other, how, return_indexers=True):
# Join left and right
# Join on same leveled multi-index frames is supported
- join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how,
- return_indexers=True)
+ join_idx, lidx, ridx = self_jnlevels.join(
+ other_jnlevels, how, return_indexers=True
+ )
# Restore the dropped levels
# Returned index level order is
# common levels, ldrop_names, rdrop_names
dropped_names = ldrop_names + rdrop_names
- levels, codes, names = (
- _restore_dropped_levels_multijoin(self, other,
- dropped_names,
- join_idx,
- lidx, ridx))
+ levels, codes, names = _restore_dropped_levels_multijoin(
+ self, other, dropped_names, join_idx, lidx, ridx
+ )
# Re-create the multi-index
- multi_join_idx = MultiIndex(levels=levels, codes=codes,
- names=names, verify_integrity=False)
+ multi_join_idx = MultiIndex(
+ levels=levels, codes=codes, names=names, verify_integrity=False
+ )
multi_join_idx = multi_join_idx.remove_unused_levels()
@@ -3496,24 +3667,24 @@ def _join_multi(self, other, how, return_indexers=True):
self, other = other, self
flip_order = True
# flip if join method is right or left
- how = {'right': 'left', 'left': 'right'}.get(how, how)
+ how = {"right": "left", "left": "right"}.get(how, how)
level = other.names.index(jl)
- result = self._join_level(other, level, how=how,
- return_indexers=return_indexers)
+ result = self._join_level(
+ other, level, how=how, return_indexers=return_indexers
+ )
if flip_order:
if isinstance(result, tuple):
return result[0], result[2], result[1]
return result
- def _join_non_unique(self, other, how='left', return_indexers=False):
+ def _join_non_unique(self, other, how="left", return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers
- left_idx, right_idx = _get_join_indexers([self._ndarray_values],
- [other._ndarray_values],
- how=how,
- sort=True)
+ left_idx, right_idx = _get_join_indexers(
+ [self._ndarray_values], [other._ndarray_values], how=how, sort=True
+ )
left_idx = ensure_platform_int(left_idx)
right_idx = ensure_platform_int(right_idx)
@@ -3529,8 +3700,9 @@ def _join_non_unique(self, other, how='left', return_indexers=False):
else:
return join_index
- def _join_level(self, other, level, how='left', return_indexers=False,
- keep_order=True):
+ def _join_level(
+ self, other, level, how="left", return_indexers=False, keep_order=True
+ ):
"""
The join method *only* affects the level of the resulting
MultiIndex. Otherwise it just exactly aligns the Index data to the
@@ -3548,7 +3720,7 @@ def _get_leaf_sorter(labels):
order of higher levels.
"""
if labels[0].size == 0:
- return np.empty(0, dtype='int64')
+ return np.empty(0, dtype="int64")
if len(labels) == 1:
lab = ensure_int64(labels[0])
@@ -3566,41 +3738,44 @@ def _get_leaf_sorter(labels):
return lib.get_level_sorter(lab, ensure_int64(starts))
if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
- raise TypeError('Join on level between two MultiIndex objects '
- 'is ambiguous')
+ raise TypeError(
+ "Join on level between two MultiIndex objects " "is ambiguous"
+ )
left, right = self, other
flip_order = not isinstance(self, MultiIndex)
if flip_order:
left, right = right, left
- how = {'right': 'left', 'left': 'right'}.get(how, how)
+ how = {"right": "left", "left": "right"}.get(how, how)
level = left._get_level_number(level)
old_level = left.levels[level]
if not right.is_unique:
- raise NotImplementedError('Index._join_level on non-unique index '
- 'is not implemented')
+ raise NotImplementedError(
+ "Index._join_level on non-unique index " "is not implemented"
+ )
- new_level, left_lev_indexer, right_lev_indexer = \
- old_level.join(right, how=how, return_indexers=True)
+ new_level, left_lev_indexer, right_lev_indexer = old_level.join(
+ right, how=how, return_indexers=True
+ )
if left_lev_indexer is None:
if keep_order or len(left) == 0:
left_indexer = None
join_index = left
else: # sort the leaves
- left_indexer = _get_leaf_sorter(left.codes[:level + 1])
+ left_indexer = _get_leaf_sorter(left.codes[: level + 1])
join_index = left[left_indexer]
else:
left_lev_indexer = ensure_int64(left_lev_indexer)
- rev_indexer = lib.get_reverse_indexer(left_lev_indexer,
- len(old_level))
+ rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
- new_lev_codes = algos.take_nd(rev_indexer, left.codes[level],
- allow_fill=False)
+ new_lev_codes = algos.take_nd(
+ rev_indexer, left.codes[level], allow_fill=False
+ )
new_codes = list(left.codes)
new_codes[level] = new_lev_codes
@@ -3619,10 +3794,11 @@ def _get_leaf_sorter(labels):
if level == 0: # outer most level, take the fast route
ngroups = 1 + new_lev_codes.max()
left_indexer, counts = libalgos.groupsort_indexer(
- new_lev_codes, ngroups)
+ new_lev_codes, ngroups
+ )
# missing values are placed first; drop them!
- left_indexer = left_indexer[counts[0]:]
+ left_indexer = left_indexer[counts[0] :]
new_codes = [lab[left_indexer] for lab in new_codes]
else: # sort the leaves
@@ -3631,7 +3807,7 @@ def _get_leaf_sorter(labels):
if not mask_all:
new_codes = [lab[mask] for lab in new_codes]
- left_indexer = _get_leaf_sorter(new_codes[:level + 1])
+ left_indexer = _get_leaf_sorter(new_codes[: level + 1])
new_codes = [lab[left_indexer] for lab in new_codes]
# left_indexers are w.r.t masked frame.
@@ -3639,13 +3815,17 @@ def _get_leaf_sorter(labels):
if not mask_all:
left_indexer = mask.nonzero()[0][left_indexer]
- join_index = MultiIndex(levels=new_levels, codes=new_codes,
- names=left.names, verify_integrity=False)
+ join_index = MultiIndex(
+ levels=new_levels,
+ codes=new_codes,
+ names=left.names,
+ verify_integrity=False,
+ )
if right_lev_indexer is not None:
- right_indexer = algos.take_nd(right_lev_indexer,
- join_index.codes[level],
- allow_fill=False)
+ right_indexer = algos.take_nd(
+ right_lev_indexer, join_index.codes[level], allow_fill=False
+ )
else:
right_indexer = join_index.codes[level]
@@ -3653,17 +3833,19 @@ def _get_leaf_sorter(labels):
left_indexer, right_indexer = right_indexer, left_indexer
if return_indexers:
- left_indexer = (None if left_indexer is None
- else ensure_platform_int(left_indexer))
- right_indexer = (None if right_indexer is None
- else ensure_platform_int(right_indexer))
+ left_indexer = (
+ None if left_indexer is None else ensure_platform_int(left_indexer)
+ )
+ right_indexer = (
+ None if right_indexer is None else ensure_platform_int(right_indexer)
+ )
return join_index, left_indexer, right_indexer
else:
return join_index
- def _join_monotonic(self, other, how='left', return_indexers=False):
+ def _join_monotonic(self, other, how="left", return_indexers=False):
if self.equals(other):
- ret_index = other if how == 'right' else self
+ ret_index = other if how == "right" else self
if return_indexers:
return ret_index, None, None
else:
@@ -3674,28 +3856,28 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
if self.is_unique and other.is_unique:
# We can perform much better than the general case
- if how == 'left':
+ if how == "left":
join_index = self
lidx = None
ridx = self._left_indexer_unique(sv, ov)
- elif how == 'right':
+ elif how == "right":
join_index = other
lidx = self._left_indexer_unique(ov, sv)
ridx = None
- elif how == 'inner':
+ elif how == "inner":
join_index, lidx, ridx = self._inner_indexer(sv, ov)
join_index = self._wrap_joined_index(join_index, other)
- elif how == 'outer':
+ elif how == "outer":
join_index, lidx, ridx = self._outer_indexer(sv, ov)
join_index = self._wrap_joined_index(join_index, other)
else:
- if how == 'left':
+ if how == "left":
join_index, lidx, ridx = self._left_indexer(sv, ov)
- elif how == 'right':
+ elif how == "right":
join_index, ridx, lidx = self._left_indexer(ov, sv)
- elif how == 'inner':
+ elif how == "inner":
join_index, lidx, ridx = self._inner_indexer(sv, ov)
- elif how == 'outer':
+ elif how == "outer":
join_index, lidx, ridx = self._outer_indexer(sv, ov)
join_index = self._wrap_joined_index(join_index, other)
@@ -3813,7 +3995,9 @@ def get_values(self):
warnings.warn(
"The 'get_values' method is deprecated and will be removed in a "
"future version. Use '.to_numpy()' or '.array' instead.",
- FutureWarning, stacklevel=2)
+ FutureWarning,
+ stacklevel=2,
+ )
return self._internal_get_values()
def _internal_get_values(self):
@@ -3827,7 +4011,9 @@ def memory_usage(self, deep=False):
result += self._engine.sizeof(deep=deep)
return result
- _index_shared_docs['where'] = """
+ _index_shared_docs[
+ "where"
+ ] = """
Return an Index of same shape as self and whose corresponding
entries are from self where cond is True and otherwise are from
other.
@@ -3844,7 +4030,7 @@ def memory_usage(self, deep=False):
Index
"""
- @Appender(_index_shared_docs['where'])
+ @Appender(_index_shared_docs["where"])
def where(self, cond, other=None):
if other is None:
other = self._na_value
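A short sketch of Index.where as documented above (invented values; the exact dtype of the result may vary):

    import pandas as pd

    idx = pd.Index([1, 2, 3, 4])
    idx.where([True, False, True, False], other=0)
    # keeps entries where cond is True and takes 0 elsewhere -> Index([1, 0, 3, 0])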
@@ -3890,11 +4076,12 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype):
"""
from .numeric import Int64Index, UInt64Index
+
if not is_unsigned_integer_dtype(dtype):
# skip int64 conversion attempt if uint-like dtype is passed, as
# this could return Int64Index when UInt64Index is what's desired
try:
- res = data.astype('i8', copy=False)
+ res = data.astype("i8", copy=False)
if (res == data).all():
return Int64Index(res, copy=copy, name=name)
except (OverflowError, TypeError, ValueError):
@@ -3903,7 +4090,7 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype):
# Conversion to int64 failed (possibly due to overflow) or was skipped,
# so let's try now with uint64.
try:
- res = data.astype('u8', copy=False)
+ res = data.astype("u8", copy=False)
if (res == data).all():
return UInt64Index(res, copy=copy, name=name)
except (OverflowError, TypeError, ValueError):
@@ -3913,14 +4100,17 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype):
@classmethod
def _scalar_data_error(cls, data):
- raise TypeError('{0}(...) must be called with a collection of some '
- 'kind, {1} was passed'.format(cls.__name__,
- repr(data)))
+ raise TypeError(
+ "{0}(...) must be called with a collection of some "
+ "kind, {1} was passed".format(cls.__name__, repr(data))
+ )
@classmethod
def _string_data_error(cls, data):
- raise TypeError('String dtype not supported, you may need '
- 'to explicitly cast to a numeric type')
+ raise TypeError(
+ "String dtype not supported, you may need "
+ "to explicitly cast to a numeric type"
+ )
@classmethod
def _coerce_to_ndarray(cls, data):
@@ -4000,7 +4190,9 @@ def is_type_compatible(self, kind):
"""
return kind == self.inferred_type
- _index_shared_docs['contains'] = """
+ _index_shared_docs[
+ "contains"
+ ] = """
Return a boolean indicating whether the provided key is in the index.
Parameters
@@ -4030,7 +4222,7 @@ def is_type_compatible(self, kind):
False
"""
- @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs)
def __contains__(self, key):
hash(key)
try:
@@ -4052,7 +4244,10 @@ def contains(self, key):
warnings.warn(
"The 'contains' method is deprecated and will be removed in a "
"future version. Use 'key in index' instead of "
- "'index.contains(key)'", FutureWarning, stacklevel=2)
+ "'index.contains(key)'",
+ FutureWarning,
+ stacklevel=2,
+ )
return key in self
def __hash__(self):
@@ -4131,7 +4326,7 @@ def append(self, other):
for obj in to_concat:
if not isinstance(obj, Index):
- raise TypeError('all inputs must be Index')
+ raise TypeError("all inputs must be Index")
names = {obj.name for obj in to_concat}
name = None if len(names) > 1 else self.name
@@ -4197,8 +4392,9 @@ def equals(self, other):
return other.equals(self)
try:
- return array_equivalent(com.values_from_object(self),
- com.values_from_object(other))
+ return array_equivalent(
+ com.values_from_object(self), com.values_from_object(other)
+ )
except Exception:
return False
@@ -4213,10 +4409,16 @@ def identical(self, other):
If two Index objects have equal elements and same type True,
otherwise False.
"""
- return (self.equals(other) and
- all((getattr(self, c, None) == getattr(other, c, None)
- for c in self._comparables)) and
- type(self) == type(other))
+ return (
+ self.equals(other)
+ and all(
+ (
+ getattr(self, c, None) == getattr(other, c, None)
+ for c in self._comparables
+ )
+ )
+ and type(self) == type(other)
+ )
def asof(self, label):
"""
@@ -4275,7 +4477,7 @@ def asof(self, label):
ValueError: index must be monotonic increasing or decreasing
"""
try:
- loc = self.get_loc(label, method='pad')
+ loc = self.get_loc(label, method="pad")
except KeyError:
return self._na_value
else:
@@ -4312,7 +4514,7 @@ def asof_locs(self, where, mask):
which correspond to the return values of the `asof` function
for every element in `where`.
"""
- locs = self.values[mask].searchsorted(where.values, side='right')
+ locs = self.values[mask].searchsorted(where.values, side="right")
locs = np.where(locs > 0, locs - 1, 0)
result = np.arange(len(self))[mask].take(locs)
@@ -4380,8 +4582,9 @@ def sort(self, *args, **kwargs):
"""
Use sort_values instead.
"""
- raise TypeError("cannot sort an Index object in-place, use "
- "sort_values instead")
+ raise TypeError(
+ "cannot sort an Index object in-place, use " "sort_values instead"
+ )
def shift(self, periods=1, freq=None):
"""
@@ -4439,8 +4642,7 @@ def shift(self, periods=1, freq=None):
'2012-03-01'],
dtype='datetime64[ns]', freq='MS')
"""
- raise NotImplementedError("Not supported for type %s" %
- type(self).__name__)
+ raise NotImplementedError("Not supported for type %s" % type(self).__name__)
def argsort(self, *args, **kwargs):
"""
@@ -4482,7 +4684,9 @@ def argsort(self, *args, **kwargs):
result = np.array(self)
return result.argsort(*args, **kwargs)
- _index_shared_docs['get_value'] = """
+ _index_shared_docs[
+ "get_value"
+ ] = """
Fast lookup of value from 1-dimensional ndarray. Only use this if you
know what you're doing.
@@ -4492,13 +4696,13 @@ def argsort(self, *args, **kwargs):
A value in the Series with the index of the key value in self.
"""
- @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs)
def get_value(self, series, key):
# if we have something that is Index-like, then
# use this, e.g. DatetimeIndex
# Things like `Series._get_value` (via .at) pass the EA directly here.
- s = getattr(series, '_values', series)
+ s = getattr(series, "_values", series)
if isinstance(s, (ExtensionArray, Index)) and is_scalar(key):
# GH 20882, 21257
# Unify Index and ExtensionArray treatment
@@ -4510,8 +4714,7 @@ def get_value(self, series, key):
iloc = self.get_loc(key)
return s[iloc]
except KeyError:
- if (len(self) > 0 and
- (self.holds_integer() or self.is_boolean())):
+ if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
raise
elif is_integer(key):
return s[key]
@@ -4519,10 +4722,9 @@ def get_value(self, series, key):
s = com.values_from_object(series)
k = com.values_from_object(key)
- k = self._convert_scalar_indexer(k, kind='getitem')
+ k = self._convert_scalar_indexer(k, kind="getitem")
try:
- return self._engine.get_value(s, k,
- tz=getattr(series.dtype, 'tz', None))
+ return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None))
except KeyError as e1:
if len(self) > 0 and (self.holds_integer() or self.is_boolean()):
raise
@@ -4553,10 +4755,13 @@ def set_value(self, arr, key, value):
-----
Only use this if you know what you're doing.
"""
- self._engine.set_value(com.values_from_object(arr),
- com.values_from_object(key), value)
+ self._engine.set_value(
+ com.values_from_object(arr), com.values_from_object(key), value
+ )
- _index_shared_docs['get_indexer_non_unique'] = """
+ _index_shared_docs[
+ "get_indexer_non_unique"
+ ] = """
Compute indexer and mask for new index given the current index. The
indexer should be then used as an input to ndarray.take to align the
current data to the new index.
@@ -4576,7 +4781,7 @@ def set_value(self, arr, key, value):
These correspond to the -1 in the indexer array.
"""
- @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
target = ensure_index(target)
if is_categorical(target):
@@ -4614,11 +4819,12 @@ def get_indexer_for(self, target, **kwargs):
def _maybe_promote(self, other):
# A hack, but it works
from pandas import DatetimeIndex
- if self.inferred_type == 'date' and isinstance(other, DatetimeIndex):
+
+ if self.inferred_type == "date" and isinstance(other, DatetimeIndex):
return DatetimeIndex(self), other
- elif self.inferred_type == 'boolean':
+ elif self.inferred_type == "boolean":
if not is_object_dtype(self.dtype):
- return self.astype('object'), other.astype('object')
+ return self.astype("object"), other.astype("object")
return self, other
def groupby(self, values):
@@ -4669,6 +4875,7 @@ def map(self, mapper, na_action=None):
"""
from .multi import MultiIndex
+
new_values = super()._map_values(mapper, na_action=na_action)
attributes = self._get_attributes_dict()
@@ -4677,17 +4884,16 @@ def map(self, mapper, na_action=None):
if new_values.size and isinstance(new_values[0], tuple):
if isinstance(self, MultiIndex):
names = self.names
- elif attributes.get('name'):
- names = [attributes.get('name')] * len(new_values[0])
+ elif attributes.get("name"):
+ names = [attributes.get("name")] * len(new_values[0])
else:
names = None
- return MultiIndex.from_tuples(new_values,
- names=names)
+ return MultiIndex.from_tuples(new_values, names=names)
- attributes['copy'] = False
+ attributes["copy"] = False
if not new_values.size:
# empty
- attributes['dtype'] = self.dtype
+ attributes["dtype"] = self.dtype
return Index(new_values, **attributes)
@@ -4823,8 +5029,7 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
>>> idx.slice_indexer(start='b', end=('c', 'g'))
slice(1, 3)
"""
- start_slice, end_slice = self.slice_locs(start, end, step=step,
- kind=kind)
+ start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
# return a slice
if not is_scalar(start_slice):
@@ -4854,17 +5059,19 @@ def _validate_indexer(self, form, key, kind):
If we are positional indexer, validate that we have appropriate
typed bounds must be an integer.
"""
- assert kind in ['ix', 'loc', 'getitem', 'iloc']
+ assert kind in ["ix", "loc", "getitem", "iloc"]
if key is None:
pass
elif is_integer(key):
pass
- elif kind in ['iloc', 'getitem']:
+ elif kind in ["iloc", "getitem"]:
self._invalid_indexer(form, key)
return key
- _index_shared_docs['_maybe_cast_slice_bound'] = """
+ _index_shared_docs[
+ "_maybe_cast_slice_bound"
+ ] = """
This function should be overloaded in subclasses that allow non-trivial
casting on label-slice bounds, e.g. datetime-like indices allowing
strings containing formatted datetimes.
@@ -4885,38 +5092,38 @@ def _validate_indexer(self, form, key, kind):
"""
- @Appender(_index_shared_docs['_maybe_cast_slice_bound'])
+ @Appender(_index_shared_docs["_maybe_cast_slice_bound"])
def _maybe_cast_slice_bound(self, label, side, kind):
- assert kind in ['ix', 'loc', 'getitem', None]
+ assert kind in ["ix", "loc", "getitem", None]
# We are a plain index here (sub-class override this method if they
# wish to have special treatment for floats/ints, e.g. Float64Index and
# datetimelike Indexes
# reject them
if is_float(label):
- if not (kind in ['ix'] and (self.holds_integer() or
- self.is_floating())):
- self._invalid_indexer('slice', label)
+ if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())):
+ self._invalid_indexer("slice", label)
# we are trying to find integer bounds on a non-integer based index
# this is rejected (generally .loc gets you here)
elif is_integer(label):
- self._invalid_indexer('slice', label)
+ self._invalid_indexer("slice", label)
return label
- def _searchsorted_monotonic(self, label, side='left'):
+ def _searchsorted_monotonic(self, label, side="left"):
if self.is_monotonic_increasing:
return self.searchsorted(label, side=side)
elif self.is_monotonic_decreasing:
# np.searchsorted expects ascending sort order, have to reverse
# everything for it to work (element ordering, search side and
# resulting value).
- pos = self[::-1].searchsorted(label, side='right' if side == 'left'
- else 'left')
+ pos = self[::-1].searchsorted(
+ label, side="right" if side == "left" else "left"
+ )
return len(self) - pos
- raise ValueError('index must be monotonic increasing or decreasing')
+ raise ValueError("index must be monotonic increasing or decreasing")
def get_slice_bound(self, label, side, kind):
"""
@@ -4936,12 +5143,13 @@ def get_slice_bound(self, label, side, kind):
int
Index of label.
"""
- assert kind in ['ix', 'loc', 'getitem', None]
+ assert kind in ["ix", "loc", "getitem", None]
- if side not in ('left', 'right'):
- raise ValueError("Invalid value for side kwarg,"
- " must be either 'left' or 'right': %s" %
- (side, ))
+ if side not in ("left", "right"):
+ raise ValueError(
+ "Invalid value for side kwarg,"
+ " must be either 'left' or 'right': %s" % (side,)
+ )
original_label = label
@@ -4963,20 +5171,22 @@ def get_slice_bound(self, label, side, kind):
# get_loc may return a boolean array or an array of indices, which
# is OK as long as they are representable by a slice.
if is_bool_dtype(slc):
- slc = lib.maybe_booleans_to_slice(slc.view('u1'))
+ slc = lib.maybe_booleans_to_slice(slc.view("u1"))
else:
- slc = lib.maybe_indices_to_slice(slc.astype('i8'), len(self))
+ slc = lib.maybe_indices_to_slice(slc.astype("i8"), len(self))
if isinstance(slc, np.ndarray):
- raise KeyError("Cannot get %s slice bound for non-unique "
- "label: %r" % (side, original_label))
+ raise KeyError(
+ "Cannot get %s slice bound for non-unique "
+ "label: %r" % (side, original_label)
+ )
if isinstance(slc, slice):
- if side == 'left':
+ if side == "left":
return slc.start
else:
return slc.stop
else:
- if side == 'right':
+ if side == "right":
return slc + 1
else:
return slc
@@ -5013,7 +5223,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
>>> idx.slice_locs(start='b', end='c')
(1, 3)
"""
- inc = (step is None or step >= 0)
+ inc = step is None or step >= 0
if not inc:
# If it's a reverse slice, temporarily swap bounds.
@@ -5021,8 +5231,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
# GH 16785: If start and end happen to be date strings with UTC offsets
# attempt to parse and check that the offsets are the same
- if (isinstance(start, (str, datetime))
- and isinstance(end, (str, datetime))):
+ if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)):
try:
ts_start = Timestamp(start)
ts_end = Timestamp(end)
@@ -5030,18 +5239,17 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
pass
else:
if not tz_compare(ts_start.tzinfo, ts_end.tzinfo):
- raise ValueError("Both dates must have the "
- "same UTC offset")
+ raise ValueError("Both dates must have the " "same UTC offset")
start_slice = None
if start is not None:
- start_slice = self.get_slice_bound(start, 'left', kind)
+ start_slice = self.get_slice_bound(start, "left", kind)
if start_slice is None:
start_slice = 0
end_slice = None
if end is not None:
- end_slice = self.get_slice_bound(end, 'right', kind)
+ end_slice = self.get_slice_bound(end, "right", kind)
if end_slice is None:
end_slice = len(self)
@@ -5102,7 +5310,7 @@ def insert(self, loc, item):
idx = np.concatenate((_self[:loc], item, _self[loc:]))
return self._shallow_copy_with_infer(idx)
- def drop(self, labels, errors='raise'):
+ def drop(self, labels, errors="raise"):
"""
Make new Index with passed list of labels deleted.
@@ -5121,14 +5329,13 @@ def drop(self, labels, errors='raise'):
KeyError
If not all of the labels are found in the selected axis
"""
- arr_dtype = 'object' if self.dtype == 'object' else None
+ arr_dtype = "object" if self.dtype == "object" else None
labels = com.index_labels_to_array(labels, dtype=arr_dtype)
indexer = self.get_indexer(labels)
mask = indexer == -1
if mask.any():
- if errors != 'ignore':
- raise KeyError(
- '{} not found in axis'.format(labels[mask]))
+ if errors != "ignore":
+ raise KeyError("{} not found in axis".format(labels[mask]))
indexer = indexer[~mask]
return self.delete(indexer)
@@ -5138,17 +5345,18 @@ def drop(self, labels, errors='raise'):
def _evaluate_with_timedelta_like(self, other, op):
# Timedelta knows how to operate with np.array, so dispatch to that
# operation and then wrap the results
- if self._is_numeric_dtype and op.__name__ in ['add', 'sub',
- 'radd', 'rsub']:
- raise TypeError("Operation {opname} between {cls} and {other} "
- "is invalid".format(opname=op.__name__,
- cls=self.dtype,
- other=type(other).__name__))
+ if self._is_numeric_dtype and op.__name__ in ["add", "sub", "radd", "rsub"]:
+ raise TypeError(
+ "Operation {opname} between {cls} and {other} "
+ "is invalid".format(
+ opname=op.__name__, cls=self.dtype, other=type(other).__name__
+ )
+ )
other = Timedelta(other)
values = self.values
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(values, other)
attrs = self._get_attributes_dict()
@@ -5177,32 +5385,32 @@ def _add_numeric_methods_add_sub_disabled(cls):
"""
Add in the numeric add/sub methods to disable.
"""
- cls.__add__ = make_invalid_op('__add__')
- cls.__radd__ = make_invalid_op('__radd__')
- cls.__iadd__ = make_invalid_op('__iadd__')
- cls.__sub__ = make_invalid_op('__sub__')
- cls.__rsub__ = make_invalid_op('__rsub__')
- cls.__isub__ = make_invalid_op('__isub__')
+ cls.__add__ = make_invalid_op("__add__")
+ cls.__radd__ = make_invalid_op("__radd__")
+ cls.__iadd__ = make_invalid_op("__iadd__")
+ cls.__sub__ = make_invalid_op("__sub__")
+ cls.__rsub__ = make_invalid_op("__rsub__")
+ cls.__isub__ = make_invalid_op("__isub__")
@classmethod
def _add_numeric_methods_disabled(cls):
"""
Add in numeric methods to disable other than add/sub.
"""
- cls.__pow__ = make_invalid_op('__pow__')
- cls.__rpow__ = make_invalid_op('__rpow__')
- cls.__mul__ = make_invalid_op('__mul__')
- cls.__rmul__ = make_invalid_op('__rmul__')
- cls.__floordiv__ = make_invalid_op('__floordiv__')
- cls.__rfloordiv__ = make_invalid_op('__rfloordiv__')
- cls.__truediv__ = make_invalid_op('__truediv__')
- cls.__rtruediv__ = make_invalid_op('__rtruediv__')
- cls.__mod__ = make_invalid_op('__mod__')
- cls.__divmod__ = make_invalid_op('__divmod__')
- cls.__neg__ = make_invalid_op('__neg__')
- cls.__pos__ = make_invalid_op('__pos__')
- cls.__abs__ = make_invalid_op('__abs__')
- cls.__inv__ = make_invalid_op('__inv__')
+ cls.__pow__ = make_invalid_op("__pow__")
+ cls.__rpow__ = make_invalid_op("__rpow__")
+ cls.__mul__ = make_invalid_op("__mul__")
+ cls.__rmul__ = make_invalid_op("__rmul__")
+ cls.__floordiv__ = make_invalid_op("__floordiv__")
+ cls.__rfloordiv__ = make_invalid_op("__rfloordiv__")
+ cls.__truediv__ = make_invalid_op("__truediv__")
+ cls.__rtruediv__ = make_invalid_op("__rtruediv__")
+ cls.__mod__ = make_invalid_op("__mod__")
+ cls.__divmod__ = make_invalid_op("__divmod__")
+ cls.__neg__ = make_invalid_op("__neg__")
+ cls.__pos__ = make_invalid_op("__pos__")
+ cls.__abs__ = make_invalid_op("__abs__")
+ cls.__inv__ = make_invalid_op("__inv__")
def _maybe_update_attributes(self, attrs):
"""
@@ -5215,9 +5423,10 @@ def _validate_for_numeric_unaryop(self, op, opstr):
Validate if we can perform a numeric unary operation.
"""
if not self._is_numeric_dtype:
- raise TypeError("cannot evaluate a numeric op "
- "{opstr} for type: {typ}"
- .format(opstr=opstr, typ=type(self).__name__))
+ raise TypeError(
+ "cannot evaluate a numeric op "
+ "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__)
+ )
def _validate_for_numeric_binop(self, other, op):
"""
@@ -5228,30 +5437,32 @@ def _validate_for_numeric_binop(self, other, op):
-----
This is an internal method called by ops.
"""
- opstr = '__{opname}__'.format(opname=op.__name__)
+ opstr = "__{opname}__".format(opname=op.__name__)
# if we are an inheritor of numeric,
# but not actually numeric (e.g. DatetimeIndex/PeriodIndex)
if not self._is_numeric_dtype:
- raise TypeError("cannot evaluate a numeric op {opstr} "
- "for type: {typ}"
- .format(opstr=opstr, typ=type(self).__name__))
+ raise TypeError(
+ "cannot evaluate a numeric op {opstr} "
+ "for type: {typ}".format(opstr=opstr, typ=type(self).__name__)
+ )
if isinstance(other, Index):
if not other._is_numeric_dtype:
- raise TypeError("cannot evaluate a numeric op "
- "{opstr} with type: {typ}"
- .format(opstr=opstr, typ=type(other)))
+ raise TypeError(
+ "cannot evaluate a numeric op "
+ "{opstr} with type: {typ}".format(opstr=opstr, typ=type(other))
+ )
elif isinstance(other, np.ndarray) and not other.ndim:
other = other.item()
if isinstance(other, (Index, ABCSeries, np.ndarray)):
if len(self) != len(other):
- raise ValueError("cannot evaluate a numeric op with "
- "unequal lengths")
+ raise ValueError("cannot evaluate a numeric op with " "unequal lengths")
other = com.values_from_object(other)
- if other.dtype.kind not in ['f', 'i', 'u']:
- raise TypeError("cannot evaluate a numeric op "
- "with a non-numeric dtype")
+ if other.dtype.kind not in ["f", "i", "u"]:
+ raise TypeError(
+ "cannot evaluate a numeric op " "with a non-numeric dtype"
+ )
elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)):
# higher up to handle
pass
@@ -5292,8 +5503,8 @@ def _add_numeric_methods_unary(cls):
"""
Add in numeric unary methods.
"""
- def _make_evaluate_unary(op, opstr):
+ def _make_evaluate_unary(op, opstr):
def _evaluate_numeric_unary(self):
self._validate_for_numeric_unaryop(op, opstr)
@@ -5304,10 +5515,10 @@ def _evaluate_numeric_unary(self):
_evaluate_numeric_unary.__name__ = opstr
return _evaluate_numeric_unary
- cls.__neg__ = _make_evaluate_unary(operator.neg, '__neg__')
- cls.__pos__ = _make_evaluate_unary(operator.pos, '__pos__')
- cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__')
- cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__')
+ cls.__neg__ = _make_evaluate_unary(operator.neg, "__neg__")
+ cls.__pos__ = _make_evaluate_unary(operator.pos, "__pos__")
+ cls.__abs__ = _make_evaluate_unary(np.abs, "__abs__")
+ cls.__inv__ = _make_evaluate_unary(lambda x: -x, "__inv__")
@classmethod
def _add_numeric_methods(cls):
@@ -5334,7 +5545,8 @@ def _add_logical_methods(cls):
%(outname)s : bool or array_like (if axis is specified)
A single element array_like may be converted to bool."""
- _index_shared_docs['index_all'] = dedent("""
+ _index_shared_docs["index_all"] = dedent(
+ """
See Also
--------
@@ -5372,9 +5584,11 @@ def _add_logical_methods(cls):
>>> pd.Index([0, 0, 0]).any()
False
- """)
+ """
+ )
- _index_shared_docs['index_any'] = dedent("""
+ _index_shared_docs["index_any"] = dedent(
+ """
See Also
--------
@@ -5395,16 +5609,19 @@ def _add_logical_methods(cls):
>>> index = pd.Index([0, 0, 0])
>>> index.any()
False
- """)
+ """
+ )
def _make_logical_function(name, desc, f):
@Substitution(outname=name, desc=desc)
- @Appender(_index_shared_docs['index_' + name])
+ @Appender(_index_shared_docs["index_" + name])
@Appender(_doc)
def logical_func(self, *args, **kwargs):
result = f(self.values)
- if (isinstance(result, (np.ndarray, ABCSeries, Index)) and
- result.ndim == 0):
+ if (
+ isinstance(result, (np.ndarray, ABCSeries, Index))
+ and result.ndim == 0
+ ):
# return NumPy type
return result.dtype.type(result.item())
else: # pragma: no cover
@@ -5413,20 +5630,20 @@ def logical_func(self, *args, **kwargs):
logical_func.__name__ = name
return logical_func
- cls.all = _make_logical_function('all', 'Return whether all elements '
- 'are True.',
- np.all)
- cls.any = _make_logical_function('any',
- 'Return whether any element is True.',
- np.any)
+ cls.all = _make_logical_function(
+ "all", "Return whether all elements " "are True.", np.all
+ )
+ cls.any = _make_logical_function(
+ "any", "Return whether any element is True.", np.any
+ )
@classmethod
def _add_logical_methods_disabled(cls):
"""
Add in logical methods to disable.
"""
- cls.all = make_invalid_op('all')
- cls.any = make_invalid_op('any')
+ cls.all = make_invalid_op("all")
+ cls.any = make_invalid_op("any")
Index._add_numeric_methods_disabled()
@@ -5511,7 +5728,7 @@ def ensure_index(index_like, copy=False):
if copy:
index_like = index_like.copy()
return index_like
- if hasattr(index_like, 'name'):
+ if hasattr(index_like, "name"):
return Index(index_like, name=index_like.name, copy=copy)
if is_iterator(index_like):
@@ -5527,6 +5744,7 @@ def ensure_index(index_like, copy=False):
if len(converted) > 0 and all_arrays:
from .multi import MultiIndex
+
return MultiIndex.from_arrays(converted)
else:
index_like = converted
@@ -5535,6 +5753,7 @@ def ensure_index(index_like, copy=False):
# so only need to do this if not list instance
if copy:
from copy import copy
+
index_like = copy(index_like)
return Index(index_like)
@@ -5557,16 +5776,17 @@ def _trim_front(strings):
Trims zeros and decimal points.
"""
trimmed = strings
- while len(strings) > 0 and all(x[0] == ' ' for x in trimmed):
+ while len(strings) > 0 and all(x[0] == " " for x in trimmed):
trimmed = [x[1:] for x in trimmed]
return trimmed
def _validate_join_method(method):
- if method not in ['left', 'right', 'inner', 'outer']:
- raise ValueError('do not recognize join method %s' % method)
+ if method not in ["left", "right", "inner", "outer"]:
+ raise ValueError("do not recognize join method %s" % method)
def default_index(n):
from pandas.core.index import RangeIndex
+
return RangeIndex(0, n, name=None)
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 321297335cf23..9550d68f1d32b 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -12,8 +12,12 @@
from pandas.util._decorators import Appender, cache_readonly
from pandas.core.dtypes.common import (
- ensure_platform_int, is_categorical_dtype, is_interval_dtype, is_list_like,
- is_scalar)
+ ensure_platform_int,
+ is_categorical_dtype,
+ is_interval_dtype,
+ is_list_like,
+ is_scalar,
+)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import ABCCategorical, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -29,20 +33,26 @@
from pandas.core.ops import get_op_result_name
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update(dict(target_klass='CategoricalIndex'))
+_index_doc_kwargs.update(dict(target_klass="CategoricalIndex"))
@accessor.delegate_names(
delegate=Categorical,
- accessors=["rename_categories",
- "reorder_categories",
- "add_categories",
- "remove_categories",
- "remove_unused_categories",
- "set_categories",
- "as_ordered", "as_unordered",
- "min", "max"],
- typ='method', overwrite=True)
+ accessors=[
+ "rename_categories",
+ "reorder_categories",
+ "add_categories",
+ "remove_categories",
+ "remove_unused_categories",
+ "set_categories",
+ "as_ordered",
+ "as_unordered",
+ "min",
+ "max",
+ ],
+ typ="method",
+ overwrite=True,
+)
class CategoricalIndex(Index, accessor.PandasDelegate):
"""
Index based on an underlying :class:`Categorical`.
@@ -134,37 +144,48 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
'c'
"""
- _typ = 'categoricalindex'
+ _typ = "categoricalindex"
@property
def _engine_type(self):
# self.codes can have dtype int8, int16, int32 or int64, so we need
# to return the corresponding engine type (libindex.Int8Engine, etc.).
- return {np.int8: libindex.Int8Engine,
- np.int16: libindex.Int16Engine,
- np.int32: libindex.Int32Engine,
- np.int64: libindex.Int64Engine,
- }[self.codes.dtype.type]
+ return {
+ np.int8: libindex.Int8Engine,
+ np.int16: libindex.Int16Engine,
+ np.int32: libindex.Int32Engine,
+ np.int64: libindex.Int64Engine,
+ }[self.codes.dtype.type]
- _attributes = ['name']
+ _attributes = ["name"]
# --------------------------------------------------------------------
# Constructors
- def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
- copy=False, name=None, fastpath=None):
+ def __new__(
+ cls,
+ data=None,
+ categories=None,
+ ordered=None,
+ dtype=None,
+ copy=False,
+ name=None,
+ fastpath=None,
+ ):
if fastpath is not None:
- warnings.warn("The 'fastpath' keyword is deprecated, and will be "
- "removed in a future version.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
if fastpath:
return cls._simple_new(data, name=name, dtype=dtype)
- dtype = CategoricalDtype._from_values_or_dtype(data, categories,
- ordered, dtype)
+ dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype)
- if name is None and hasattr(data, 'name'):
+ if name is None and hasattr(data, "name"):
name = data.name
if not is_categorical_dtype(data):
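A brief illustration of the constructor and codes-based engine selection touched in this hunk (invented values):

    import pandas as pd

    ci = pd.CategoricalIndex(["a", "b", "a"], categories=["a", "b", "c"])
    ci.codes            # array([0, 1, 0], dtype=int8) -- int8 codes pick libindex.Int8Engine above
    ci.get_loc("a")     # array([ True, False,  True]) -- boolean mask for a repeated label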
@@ -221,8 +242,7 @@ def _create_categorical(cls, data, dtype=None):
-------
Categorical
"""
- if (isinstance(data, (cls, ABCSeries)) and
- is_categorical_dtype(data)):
+ if isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data):
data = data.values
if not isinstance(data, ABCCategorical):
@@ -248,7 +268,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
# --------------------------------------------------------------------
- @Appender(_index_shared_docs['_shallow_copy'])
+ @Appender(_index_shared_docs["_shallow_copy"])
def _shallow_copy(self, values=None, dtype=None, **kwargs):
if dtype is None:
dtype = self.dtype
@@ -269,17 +289,18 @@ def _is_dtype_compat(self, other):
if isinstance(other, CategoricalIndex):
other = other._values
if not other.is_dtype_equal(self):
- raise TypeError("categories must match existing categories "
- "when appending")
+ raise TypeError(
+ "categories must match existing categories " "when appending"
+ )
else:
values = other
if not is_list_like(values):
values = [values]
- other = CategoricalIndex(self._create_categorical(
- other, dtype=self.dtype))
+ other = CategoricalIndex(self._create_categorical(other, dtype=self.dtype))
if not other.isin(values).all():
- raise TypeError("cannot append a non-category item to a "
- "CategoricalIndex")
+ raise TypeError(
+ "cannot append a non-category item to a " "CategoricalIndex"
+ )
return other
@@ -320,26 +341,31 @@ def _format_attrs(self):
"""
Return a list of tuples of the (attr,formatted_value)
"""
- max_categories = (10 if get_option("display.max_categories") == 0 else
- get_option("display.max_categories"))
+ max_categories = (
+ 10
+ if get_option("display.max_categories") == 0
+ else get_option("display.max_categories")
+ )
attrs = [
- ('categories',
- ibase.default_pprint(self.categories,
- max_seq_items=max_categories)),
- ('ordered', self.ordered)]
+ (
+ "categories",
+ ibase.default_pprint(self.categories, max_seq_items=max_categories),
+ ),
+ ("ordered", self.ordered),
+ ]
if self.name is not None:
- attrs.append(('name', ibase.default_pprint(self.name)))
- attrs.append(('dtype', "'%s'" % self.dtype.name))
- max_seq_items = get_option('display.max_seq_items') or len(self)
+ attrs.append(("name", ibase.default_pprint(self.name)))
+ attrs.append(("dtype", "'%s'" % self.dtype.name))
+ max_seq_items = get_option("display.max_seq_items") or len(self)
if len(self) > max_seq_items:
- attrs.append(('length', len(self)))
+ attrs.append(("length", len(self)))
return attrs
# --------------------------------------------------------------------
@property
def inferred_type(self):
- return 'categorical'
+ return "categorical"
@property
def values(self):
@@ -378,7 +404,7 @@ def ordered(self):
def _reverse_indexer(self):
return self._data._reverse_indexer()
- @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs)
def __contains__(self, key):
# if key is a NaN, check if any NaN is in self.
if isna(key):
@@ -390,10 +416,11 @@ def __array__(self, dtype=None):
""" the array interface, return my values """
return np.array(self._data, dtype=dtype)
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
if is_interval_dtype(dtype):
from pandas import IntervalIndex
+
return IntervalIndex(np.array(self))
elif is_categorical_dtype(dtype):
# GH 18630
@@ -408,7 +435,7 @@ def _isnan(self):
""" return if each value is nan"""
return self._data.codes == -1
- @Appender(ibase._index_shared_docs['fillna'])
+ @Appender(ibase._index_shared_docs["fillna"])
def fillna(self, value, downcast=None):
self._assert_can_do_op(value)
return CategoricalIndex(self._data.fillna(value), name=self.name)
@@ -435,7 +462,7 @@ def is_monotonic_increasing(self):
def is_monotonic_decreasing(self):
return self._engine.is_monotonic_decreasing
- @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs)
def unique(self, level=None):
if level is not None:
self._validate_index_level(level)
@@ -445,14 +472,15 @@ def unique(self, level=None):
return self._shallow_copy(result, dtype=result.dtype)
@Appender(Index.duplicated.__doc__)
- def duplicated(self, keep='first'):
+ def duplicated(self, keep="first"):
from pandas._libs.hashtable import duplicated_int64
- codes = self.codes.astype('i8')
+
+ codes = self.codes.astype("i8")
return duplicated_int64(codes, keep)
def _to_safe_for_reshape(self):
""" convert to object if we are a categorical """
- return self.astype('object')
+ return self.astype("object")
def get_loc(self, key, method=None):
"""
@@ -493,9 +521,7 @@ def get_loc(self, key, method=None):
except KeyError:
raise KeyError(key)
- def get_value(self,
- series: AnyArrayLike,
- key: Any):
+ def get_value(self, series: AnyArrayLike, key: Any):
"""
Fast lookup of value from 1-dimensional ndarray. Only use this if you
know what you're doing
@@ -515,7 +541,7 @@ def get_value(self,
"""
try:
k = com.values_from_object(key)
- k = self._convert_scalar_indexer(k, kind='getitem')
+ k = self._convert_scalar_indexer(k, kind="getitem")
indexer = self.get_loc(k)
return series.take([indexer])[0]
except (KeyError, TypeError):
@@ -528,7 +554,7 @@ def _can_reindex(self, indexer):
""" always allow reindexing """
pass
- @Appender(_index_shared_docs['where'])
+ @Appender(_index_shared_docs["where"])
def where(self, cond, other=None):
# TODO: Investigate an alternative implementation with
# 1. copy the underlying Categorical
@@ -540,8 +566,7 @@ def where(self, cond, other=None):
cat = Categorical(values, dtype=self.dtype)
return self._shallow_copy(cat, **self._get_attributes_dict())
- def reindex(self, target, method=None, level=None, limit=None,
- tolerance=None):
+ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
"""
Create index with target's values (move/add/delete values as necessary)
@@ -555,14 +580,17 @@ def reindex(self, target, method=None, level=None, limit=None,
"""
if method is not None:
- raise NotImplementedError("argument method is not implemented for "
- "CategoricalIndex.reindex")
+ raise NotImplementedError(
+ "argument method is not implemented for " "CategoricalIndex.reindex"
+ )
if level is not None:
- raise NotImplementedError("argument level is not implemented for "
- "CategoricalIndex.reindex")
+ raise NotImplementedError(
+ "argument level is not implemented for " "CategoricalIndex.reindex"
+ )
if limit is not None:
- raise NotImplementedError("argument limit is not implemented for "
- "CategoricalIndex.reindex")
+ raise NotImplementedError(
+ "argument limit is not implemented for " "CategoricalIndex.reindex"
+ )
target = ibase.ensure_index(target)
@@ -587,8 +615,7 @@ def reindex(self, target, method=None, level=None, limit=None,
if (cats == -1).any():
# coerce to a regular index here!
result = Index(np.array(self), name=self.name)
- new_target, indexer, _ = result._reindex_non_unique(
- np.array(target))
+ new_target, indexer, _ = result._reindex_non_unique(np.array(target))
else:
codes = new_target.codes.copy()
@@ -628,7 +655,7 @@ def _reindex_non_unique(self, target):
return new_target, indexer, new_indexer
- @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
from pandas.core.arrays.categorical import _recode_for_categories
@@ -636,24 +663,26 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ibase.ensure_index(target)
if self.is_unique and self.equals(target):
- return np.arange(len(self), dtype='intp')
-
- if method == 'pad' or method == 'backfill':
- raise NotImplementedError("method='pad' and method='backfill' not "
- "implemented yet for CategoricalIndex")
- elif method == 'nearest':
- raise NotImplementedError("method='nearest' not implemented yet "
- 'for CategoricalIndex')
-
- if (isinstance(target, CategoricalIndex) and
- self.values.is_dtype_equal(target)):
+ return np.arange(len(self), dtype="intp")
+
+ if method == "pad" or method == "backfill":
+ raise NotImplementedError(
+ "method='pad' and method='backfill' not "
+ "implemented yet for CategoricalIndex"
+ )
+ elif method == "nearest":
+ raise NotImplementedError(
+ "method='nearest' not implemented yet " "for CategoricalIndex"
+ )
+
+ if isinstance(target, CategoricalIndex) and self.values.is_dtype_equal(target):
if self.values.equals(target.values):
# we have the same codes
codes = target.codes
else:
- codes = _recode_for_categories(target.codes,
- target.categories,
- self.values.categories)
+ codes = _recode_for_categories(
+ target.codes, target.categories, self.values.categories
+ )
else:
if isinstance(target, CategoricalIndex):
code_indexer = self.categories.get_indexer(target.categories)
@@ -664,7 +693,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
indexer, _ = self._engine.get_indexer_non_unique(codes)
return ensure_platform_int(indexer)
- @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
target = ibase.ensure_index(target)
@@ -680,14 +709,14 @@ def get_indexer_non_unique(self, target):
indexer, missing = self._engine.get_indexer_non_unique(codes)
return ensure_platform_int(indexer), missing
- @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ @Appender(_index_shared_docs["_convert_scalar_indexer"])
def _convert_scalar_indexer(self, key, kind=None):
if self.categories._defer_to_indexing:
return self.categories._convert_scalar_indexer(key, kind=kind)
return super()._convert_scalar_indexer(key, kind=kind)
- @Appender(_index_shared_docs['_convert_list_indexer'])
+ @Appender(_index_shared_docs["_convert_list_indexer"])
def _convert_list_indexer(self, keyarr, kind=None):
# Return our indexer or raise if all of the values are not included in
# the categories
@@ -701,11 +730,12 @@ def _convert_list_indexer(self, keyarr, kind=None):
raise KeyError(
"a list-indexer must only "
"include values that are "
- "in the categories")
+ "in the categories"
+ )
return self.get_indexer(keyarr)
- @Appender(_index_shared_docs['_convert_arr_indexer'])
+ @Appender(_index_shared_docs["_convert_arr_indexer"])
def _convert_arr_indexer(self, keyarr):
keyarr = com.asarray_tuplesafe(keyarr)
@@ -714,19 +744,21 @@ def _convert_arr_indexer(self, keyarr):
return self._shallow_copy(keyarr)
- @Appender(_index_shared_docs['_convert_index_indexer'])
+ @Appender(_index_shared_docs["_convert_index_indexer"])
def _convert_index_indexer(self, keyarr):
return self._shallow_copy(keyarr)
- @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
- def take(self, indices, axis=0, allow_fill=True,
- fill_value=None, **kwargs):
+ @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
nv.validate_take(tuple(), kwargs)
indices = ensure_platform_int(indices)
- taken = self._assert_take_fillable(self.codes, indices,
- allow_fill=allow_fill,
- fill_value=fill_value,
- na_value=-1)
+ taken = self._assert_take_fillable(
+ self.codes,
+ indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=-1,
+ )
return self._create_from_codes(taken)
def is_dtype_equal(self, other):
@@ -834,8 +866,10 @@ def insert(self, loc, item):
"""
code = self.categories.get_indexer([item])
if (code == -1) and not (is_scalar(item) and isna(item)):
- raise TypeError("cannot insert an item into a CategoricalIndex "
- "that is not already an existing category")
+ raise TypeError(
+ "cannot insert an item into a CategoricalIndex "
+ "that is not already an existing category"
+ )
codes = self.codes
codes = np.concatenate((codes[:loc], code, codes[loc:]))
@@ -850,8 +884,7 @@ def _concat_same_dtype(self, to_concat, name):
Concatenate to_concat which has the same class
ValueError if other is not in the categories
"""
- codes = np.concatenate([self._is_dtype_compat(c).codes
- for c in to_concat])
+ codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
result = self._create_from_codes(codes, name=name)
# if name is None, _create_from_codes sets self.name
result.name = name
@@ -866,7 +899,7 @@ def _add_comparison_methods(cls):
""" add in comparison methods """
def _make_compare(op):
- opname = '__{op}__'.format(op=op.__name__)
+ opname = "__{op}__".format(op=op.__name__)
def _evaluate_compare(self, other):
@@ -875,19 +908,19 @@ def _evaluate_compare(self, other):
if isinstance(other, CategoricalIndex):
other = other._values
elif isinstance(other, Index):
- other = self._create_categorical(
- other._values, dtype=self.dtype)
+ other = self._create_categorical(other._values, dtype=self.dtype)
- if isinstance(other, (ABCCategorical, np.ndarray,
- ABCSeries)):
+ if isinstance(other, (ABCCategorical, np.ndarray, ABCSeries)):
if len(self.values) != len(other):
raise ValueError("Lengths must match to compare")
if isinstance(other, ABCCategorical):
if not self.values.is_dtype_equal(other):
- raise TypeError("categorical index comparisons must "
- "have the same categories and ordered "
- "attributes")
+ raise TypeError(
+ "categorical index comparisons must "
+ "have the same categories and ordered "
+ "attributes"
+ )
result = op(self.values, other)
if isinstance(result, ABCSeries):
@@ -908,7 +941,7 @@ def _evaluate_compare(self, other):
def _delegate_method(self, name, *args, **kwargs):
""" method delegation to the ._values """
method = getattr(self._values, name)
- if 'inplace' in kwargs:
+ if "inplace" in kwargs:
raise ValueError("cannot use inplace with CategoricalIndex")
res = method(*args, **kwargs)
if is_scalar(res):
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index e141f7b5c5b23..f2e6f631ae9ee 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -14,15 +14,23 @@
from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg
from pandas.core.dtypes.common import (
- ensure_int64, is_dtype_equal, is_float, is_integer, is_list_like,
- is_period_dtype, is_scalar)
+ ensure_int64,
+ is_dtype_equal,
+ is_float,
+ is_integer,
+ is_list_like,
+ is_period_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries
from pandas.core import algorithms, ops
from pandas.core.accessor import PandasDelegate
from pandas.core.arrays import ExtensionOpsMixin
from pandas.core.arrays.datetimelike import (
- DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8)
+ DatetimeLikeArrayMixin,
+ _ensure_datetimelike_to_i8,
+)
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import Index, _index_shared_docs
from pandas.core.tools.timedeltas import to_timedelta
@@ -58,24 +66,24 @@ class DatetimeIndexOpsMixin(ExtensionOpsMixin):
"""
common ops mixin to support a unified interface for a datetimelike Index
"""
+
_data = None
# DatetimeLikeArrayMixin assumes subclasses are mutable, so these are
# properties there. They can be made into cache_readonly for Index
# subclasses bc they are immutable
inferred_freq = cache_readonly(
- DatetimeLikeArrayMixin.inferred_freq.fget) # type: ignore
+ DatetimeLikeArrayMixin.inferred_freq.fget
+ ) # type: ignore
_isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore
- hasnans = cache_readonly(
- DatetimeLikeArrayMixin._hasnans.fget) # type: ignore
+ hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore
_hasnans = hasnans # for index / array -agnostic code
_resolution = cache_readonly(
- DatetimeLikeArrayMixin._resolution.fget) # type: ignore
- resolution = cache_readonly(
- DatetimeLikeArrayMixin.resolution.fget) # type: ignore
+ DatetimeLikeArrayMixin._resolution.fget
+ ) # type: ignore
+ resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore
- _maybe_mask_results = ea_passthrough(
- DatetimeLikeArrayMixin._maybe_mask_results)
+ _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results)
__iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__)
mean = ea_passthrough(DatetimeLikeArrayMixin.mean)
@@ -114,6 +122,7 @@ def _create_comparison_method(cls, op):
"""
Create a comparison method that dispatches to ``cls.values``.
"""
+
def wrapper(self, other):
if isinstance(other, ABCSeries):
# the arrays defer to Series for comparison ops but the indexes
@@ -124,7 +133,7 @@ def wrapper(self, other):
return result
wrapper.__doc__ = op.__doc__
- wrapper.__name__ = '__{}__'.format(op.__name__)
+ wrapper.__name__ = "__{}__".format(op.__name__)
return wrapper
@property
@@ -182,12 +191,14 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True):
@staticmethod
def wrapper(left, right):
- if isinstance(left, (np.ndarray, ABCIndex, ABCSeries,
- DatetimeLikeArrayMixin)):
- left = left.view('i8')
- if isinstance(right, (np.ndarray, ABCIndex, ABCSeries,
- DatetimeLikeArrayMixin)):
- right = right.view('i8')
+ if isinstance(
+ left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)
+ ):
+ left = left.view("i8")
+ if isinstance(
+ right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)
+ ):
+ right = right.view("i8")
results = joinf(left, right)
if with_indexers:
join_index, left_indexer, right_indexer = results
@@ -197,27 +208,30 @@ def wrapper(left, right):
return wrapper
- def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',
- from_utc=False):
+ def _ensure_localized(
+ self, arg, ambiguous="raise", nonexistent="raise", from_utc=False
+ ):
# See DatetimeLikeArrayMixin._ensure_localized.__doc__
- if getattr(self, 'tz', None):
+ if getattr(self, "tz", None):
# ensure_localized is only relevant for tz-aware DTI
- result = self._data._ensure_localized(arg,
- ambiguous=ambiguous,
- nonexistent=nonexistent,
- from_utc=from_utc)
+ result = self._data._ensure_localized(
+ arg, ambiguous=ambiguous, nonexistent=nonexistent, from_utc=from_utc
+ )
return type(self)._simple_new(result, name=self.name)
return arg
def _box_values(self, values):
return self._data._box_values(values)
- @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs)
def __contains__(self, key):
try:
res = self.get_loc(key)
- return (is_scalar(res) or isinstance(res, slice) or
- (is_list_like(res) and len(res)))
+ return (
+ is_scalar(res)
+ or isinstance(res, slice)
+ or (is_list_like(res) and len(res))
+ )
except (KeyError, TypeError, ValueError):
return False
@@ -232,7 +246,7 @@ def map(self, mapper, na_action=None):
result = Index(result)
if not isinstance(result, Index):
- raise TypeError('The map function must return an Index object')
+ raise TypeError("The map function must return an Index object")
return result
except Exception:
return self.astype(object).map(mapper)
@@ -250,23 +264,22 @@ def sort_values(self, return_indexer=False, ascending=True):
else:
sorted_values = np.sort(self._ndarray_values)
attribs = self._get_attributes_dict()
- freq = attribs['freq']
+ freq = attribs["freq"]
if freq is not None and not is_period_dtype(self):
if freq.n > 0 and not ascending:
freq = freq * -1
elif freq.n < 0 and ascending:
freq = freq * -1
- attribs['freq'] = freq
+ attribs["freq"] = freq
if not ascending:
sorted_values = sorted_values[::-1]
return self._simple_new(sorted_values, **attribs)
- @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
- def take(self, indices, axis=0, allow_fill=True,
- fill_value=None, **kwargs):
+ @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
nv.validate_take(tuple(), kwargs)
indices = ensure_int64(indices)
@@ -274,10 +287,13 @@ def take(self, indices, axis=0, allow_fill=True,
if isinstance(maybe_slice, slice):
return self[maybe_slice]
- taken = self._assert_take_fillable(self.asi8, indices,
- allow_fill=allow_fill,
- fill_value=fill_value,
- na_value=iNaT)
+ taken = self._assert_take_fillable(
+ self.asi8,
+ indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=iNaT,
+ )
# keep freq in PeriodArray/Index, reset otherwise
freq = self.freq if is_period_dtype(self) else None
@@ -298,16 +314,18 @@ def asobject(self):
*this is an internal non-public method*
"""
- warnings.warn("'asobject' is deprecated. Use 'astype(object)'"
- " instead", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'asobject' is deprecated. Use 'astype(object)'" " instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self.astype(object)
def _convert_tolerance(self, tolerance, target):
tolerance = np.asarray(to_timedelta(tolerance).to_numpy())
if target.size != tolerance.size and tolerance.size > 1:
- raise ValueError('list-like tolerance size must match '
- 'target index size')
+ raise ValueError("list-like tolerance size must match " "target index size")
return tolerance
def tolist(self):
@@ -370,7 +388,7 @@ def argmin(self, axis=None, skipna=True, *args, **kwargs):
if mask.all() or not skipna:
return -1
i8 = i8.copy()
- i8[mask] = np.iinfo('int64').max
+ i8[mask] = np.iinfo("int64").max
return i8.argmin()
def max(self, axis=None, skipna=True, *args, **kwargs):
@@ -433,7 +451,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs):
# --------------------------------------------------------------------
# Rendering Methods
- def _format_with_header(self, header, na_rep='NaT', **kwargs):
+ def _format_with_header(self, header, na_rep="NaT", **kwargs):
return header + list(self._format_native_types(na_rep, **kwargs))
@property
@@ -446,11 +464,11 @@ def _format_attrs(self):
"""
attrs = super()._format_attrs()
for attrib in self._attributes:
- if attrib == 'freq':
+ if attrib == "freq":
freq = self.freqstr
if freq is not None:
freq = "'%s'" % freq
- attrs.append(('freq', freq))
+ attrs.append(("freq", freq))
return attrs
# --------------------------------------------------------------------
@@ -466,17 +484,17 @@ def _convert_scalar_indexer(self, key, kind=None):
kind : {'ix', 'loc', 'getitem', 'iloc'} or None
"""
- assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+ assert kind in ["ix", "loc", "getitem", "iloc", None]
# we don't allow integer/float indexing for loc
# we don't allow float indexing for ix/getitem
if is_scalar(key):
is_int = is_integer(key)
is_flt = is_float(key)
- if kind in ['loc'] and (is_int or is_flt):
- self._invalid_indexer('index', key)
- elif kind in ['ix', 'getitem'] and is_flt:
- self._invalid_indexer('index', key)
+ if kind in ["loc"] and (is_int or is_flt):
+ self._invalid_indexer("index", key)
+ elif kind in ["ix", "getitem"] and is_flt:
+ self._invalid_indexer("index", key)
return super()._convert_scalar_indexer(key, kind=kind)
@@ -497,6 +515,7 @@ def __add__(self, other):
def __radd__(self, other):
# alias for __add__
return self.__add__(other)
+
cls.__radd__ = __radd__
def __sub__(self, other):
@@ -555,21 +574,24 @@ def intersection(self, other, sort=False):
result.freq = to_offset(result.inferred_freq)
return result
- elif (other.freq is None or self.freq is None or
- other.freq != self.freq or
- not other.freq.isAnchored() or
- (not self.is_monotonic or not other.is_monotonic)):
+ elif (
+ other.freq is None
+ or self.freq is None
+ or other.freq != self.freq
+ or not other.freq.isAnchored()
+ or (not self.is_monotonic or not other.is_monotonic)
+ ):
result = Index.intersection(self, other, sort=sort)
# Invalidate the freq of `result`, which may not be correct at
# this point, depending on the values.
result.freq = None
- if hasattr(self, 'tz'):
- result = self._shallow_copy(result._values, name=result.name,
- tz=result.tz, freq=None)
+ if hasattr(self, "tz"):
+ result = self._shallow_copy(
+ result._values, name=result.name, tz=result.tz, freq=None
+ )
else:
- result = self._shallow_copy(result._values, name=result.name,
- freq=None)
+ result = self._shallow_copy(result._values, name=result.name, freq=None)
if result.freq is None:
result.freq = to_offset(result.inferred_freq)
return result
@@ -592,17 +614,17 @@ def intersection(self, other, sort=False):
left_chunk = left.values[lslice]
return self._shallow_copy(left_chunk)
- @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
freq = self.freq if is_period_dtype(self) else None
return self._shallow_copy(self.asi8.repeat(repeats), freq=freq)
- @Appender(_index_shared_docs['where'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["where"] % _index_doc_kwargs)
def where(self, cond, other=None):
other = _ensure_datetimelike_to_i8(other, to_utc=True)
values = _ensure_datetimelike_to_i8(self, to_utc=True)
- result = np.where(cond, values, other).astype('i8')
+ result = np.where(cond, values, other).astype("i8")
result = self._ensure_localized(result, from_utc=True)
return self._shallow_copy(result)
@@ -622,17 +644,19 @@ def _summary(self, name=None):
"""
formatter = self._formatter_func
if len(self) > 0:
- index_summary = ', %s to %s' % (formatter(self[0]),
- formatter(self[-1]))
+ index_summary = ", %s to %s" % (formatter(self[0]), formatter(self[-1]))
else:
- index_summary = ''
+ index_summary = ""
if name is None:
name = type(self).__name__
- result = '%s: %s entries%s' % (printing.pprint_thing(name),
- len(self), index_summary)
+ result = "%s: %s entries%s" % (
+ printing.pprint_thing(name),
+ len(self),
+ index_summary,
+ )
if self.freq:
- result += '\nFreq: %s' % self.freqstr
+ result += "\nFreq: %s" % self.freqstr
# display as values, not quoted
result = result.replace("'", "")
@@ -643,10 +667,10 @@ def _concat_same_dtype(self, to_concat, name):
Concatenate to_concat which has the same class.
"""
attribs = self._get_attributes_dict()
- attribs['name'] = name
+ attribs["name"] = name
# do not pass tz to set because tzlocal cannot be hashed
if len({str(x.dtype) for x in to_concat}) != 1:
- raise ValueError('to_concat must have the same tz')
+ raise ValueError("to_concat must have the same tz")
new_data = type(self._values)._concat_same_type(to_concat).asi8
@@ -655,11 +679,11 @@ def _concat_same_dtype(self, to_concat, name):
is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1
if not is_period_dtype(self) and not is_diff_evenly_spaced:
# reset freq
- attribs['freq'] = None
+ attribs["freq"] = None
return self._simple_new(new_data, **attribs)
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
if is_dtype_equal(self.dtype, dtype) and copy is False:
# Ensure that self.astype(self.dtype) is self
@@ -669,10 +693,9 @@ def astype(self, dtype, copy=True):
# pass copy=False because any copying will be done in the
# _data.astype call above
- return Index(new_values,
- dtype=new_values.dtype, name=self.name, copy=False)
+ return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False)
- @deprecate_kwarg(old_arg_name='n', new_arg_name='periods')
+ @deprecate_kwarg(old_arg_name="n", new_arg_name="periods")
def shift(self, periods, freq=None):
"""
Shift index by desired number of time frequency increments.
@@ -714,8 +737,10 @@ def wrap_arithmetic_op(self, other, result):
if isinstance(result, tuple):
# divmod, rdivmod
assert len(result) == 2
- return (wrap_arithmetic_op(self, other, result[0]),
- wrap_arithmetic_op(self, other, result[1]))
+ return (
+ wrap_arithmetic_op(self, other, result[0]),
+ wrap_arithmetic_op(self, other, result[1]),
+ )
if not isinstance(result, Index):
# Index.__new__ will choose appropriate subclass for dtype
@@ -763,6 +788,7 @@ class DatetimelikeDelegateMixin(PandasDelegate):
The set of properties whose results should *not* be
boxed in an index, after being returned from the array
"""
+
# raw_methods : dispatch methods that shouldn't be boxed in an Index
_raw_methods = set() # type: Set[str]
# raw_properties : dispatch properties that shouldn't be boxed in an Index
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index e2658b66f83ba..5024eebe03bb4 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -10,20 +10,33 @@
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.core.dtypes.common import (
- _NS_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar,
- is_string_like)
+ _NS_DTYPE,
+ ensure_int64,
+ is_float,
+ is_integer,
+ is_list_like,
+ is_scalar,
+ is_string_like,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
from pandas.core.accessor import delegate_names
from pandas.core.arrays.datetimes import (
- DatetimeArray, _to_M8, tz_to_dtype, validate_tz_from_dtype)
+ DatetimeArray,
+ _to_M8,
+ tz_to_dtype,
+ validate_tz_from_dtype,
+)
from pandas.core.base import _shared_docs
import pandas.core.common as com
from pandas.core.indexes.base import Index
from pandas.core.indexes.datetimelike import (
- DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ea_passthrough)
+ DatetimeIndexOpsMixin,
+ DatetimelikeDelegateMixin,
+ ea_passthrough,
+)
from pandas.core.indexes.numeric import Int64Index
from pandas.core.ops import get_op_result_name
import pandas.core.tools.datetimes as tools
@@ -56,42 +69,31 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin):
# Some are "raw" methods, the result is not not re-boxed in an Index
# We also have a few "extra" attrs, which may or may not be raw,
# which we we dont' want to expose in the .dt accessor.
- _extra_methods = [
- 'to_period',
- 'to_perioddelta',
- 'to_julian_date',
- ]
- _extra_raw_methods = [
- 'to_pydatetime',
- '_local_timestamps',
- '_has_same_tz',
- ]
- _extra_raw_properties = [
- '_box_func',
- 'tz', 'tzinfo',
- ]
- _delegated_properties = (
- DatetimeArray._datetimelike_ops + _extra_raw_properties
- )
+ _extra_methods = ["to_period", "to_perioddelta", "to_julian_date"]
+ _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"]
+ _extra_raw_properties = ["_box_func", "tz", "tzinfo"]
+ _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties
_delegated_methods = (
- DatetimeArray._datetimelike_methods + _extra_methods +
- _extra_raw_methods
+ DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods
+ )
+ _raw_properties = (
+ {"date", "time", "timetz"}
+ | set(DatetimeArray._bool_ops)
+ | set(_extra_raw_properties)
)
- _raw_properties = {
- 'date',
- 'time',
- 'timetz',
- } | set(DatetimeArray._bool_ops) | set(_extra_raw_properties)
_raw_methods = set(_extra_raw_methods)
_delegate_class = DatetimeArray
-@delegate_names(DatetimeArray,
- DatetimeDelegateMixin._delegated_properties,
- typ="property")
-@delegate_names(DatetimeArray,
- DatetimeDelegateMixin._delegated_methods,
- typ="method", overwrite=False)
+@delegate_names(
+ DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property"
+)
+@delegate_names(
+ DatetimeArray,
+ DatetimeDelegateMixin._delegated_methods,
+ typ="method",
+ overwrite=False,
+)
class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin):
"""
Immutable ndarray of datetime64 data, represented internally as int64, and
@@ -221,25 +223,26 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin):
Creating a DatetimeIndex based on `start`, `periods`, and `end` has
been deprecated in favor of :func:`date_range`.
"""
- _typ = 'datetimeindex'
+
+ _typ = "datetimeindex"
_join_precedence = 10
def _join_i8_wrapper(joinf, **kwargs):
- return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]',
- **kwargs)
+ return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs)
_inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64)
_outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64)
_left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64)
_left_indexer_unique = _join_i8_wrapper(
- libjoin.left_join_indexer_unique_int64, with_indexers=False)
+ libjoin.left_join_indexer_unique_int64, with_indexers=False
+ )
_engine_type = libindex.DatetimeEngine
_tz = None
_freq = None
- _comparables = ['name', 'freqstr', 'tz']
- _attributes = ['name', 'tz', 'freq']
+ _comparables = ["name", "freqstr", "tz"]
+ _attributes = ["name", "tz", "freq"]
_is_numeric_dtype = False
_infer_as_myclass = True
@@ -256,48 +259,81 @@ def _join_i8_wrapper(joinf, **kwargs):
# --------------------------------------------------------------------
# Constructors
- def __new__(cls, data=None,
- freq=None, start=None, end=None, periods=None, tz=None,
- normalize=False, closed=None, ambiguous='raise',
- dayfirst=False, yearfirst=False, dtype=None,
- copy=False, name=None, verify_integrity=None):
+ def __new__(
+ cls,
+ data=None,
+ freq=None,
+ start=None,
+ end=None,
+ periods=None,
+ tz=None,
+ normalize=False,
+ closed=None,
+ ambiguous="raise",
+ dayfirst=False,
+ yearfirst=False,
+ dtype=None,
+ copy=False,
+ name=None,
+ verify_integrity=None,
+ ):
if verify_integrity is not None:
- warnings.warn("The 'verify_integrity' argument is deprecated, "
- "will be removed in a future version.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'verify_integrity' argument is deprecated, "
+ "will be removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
else:
verify_integrity = True
if data is None:
dtarr = DatetimeArray._generate_range(
- start, end, periods,
- freq=freq, tz=tz, normalize=normalize,
- closed=closed, ambiguous=ambiguous)
- warnings.warn("Creating a DatetimeIndex by passing range "
- "endpoints is deprecated. Use "
- "`pandas.date_range` instead.",
- FutureWarning, stacklevel=2)
- return cls._simple_new(
- dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name)
+ start,
+ end,
+ periods,
+ freq=freq,
+ tz=tz,
+ normalize=normalize,
+ closed=closed,
+ ambiguous=ambiguous,
+ )
+ warnings.warn(
+ "Creating a DatetimeIndex by passing range "
+ "endpoints is deprecated. Use "
+ "`pandas.date_range` instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return cls._simple_new(dtarr._data, freq=dtarr.freq, tz=dtarr.tz, name=name)
if is_scalar(data):
- raise TypeError("{cls}() must be called with a "
- "collection of some kind, {data} was passed"
- .format(cls=cls.__name__, data=repr(data)))
+ raise TypeError(
+ "{cls}() must be called with a "
+ "collection of some kind, {data} was passed".format(
+ cls=cls.__name__, data=repr(data)
+ )
+ )
# - Cases checked above all return/raise before reaching here - #
- if name is None and hasattr(data, 'name'):
+ if name is None and hasattr(data, "name"):
name = data.name
dtarr = DatetimeArray._from_sequence(
- data, dtype=dtype, copy=copy, tz=tz, freq=freq,
- dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous,
- int_as_wall_time=True)
-
- subarr = cls._simple_new(dtarr, name=name,
- freq=dtarr.freq, tz=dtarr.tz)
+ data,
+ dtype=dtype,
+ copy=copy,
+ tz=tz,
+ freq=freq,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ ambiguous=ambiguous,
+ int_as_wall_time=True,
+ )
+
+ subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz)
return subarr
@classmethod
@@ -337,8 +373,11 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):
# --------------------------------------------------------------------
def __array__(self, dtype=None):
- if (dtype is None and isinstance(self._data, DatetimeArray)
- and getattr(self.dtype, 'tz', None)):
+ if (
+ dtype is None
+ and isinstance(self._data, DatetimeArray)
+ and getattr(self.dtype, "tz", None)
+ ):
msg = (
"Converting timezone-aware DatetimeArray to timezone-naive "
"ndarray with 'datetime64[ns]' dtype. In the future, this "
@@ -348,7 +387,7 @@ def __array__(self, dtype=None):
"To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
)
warnings.warn(msg, FutureWarning, stacklevel=3)
- dtype = 'M8[ns]'
+ dtype = "M8[ns]"
return np.asarray(self._data, dtype=dtype)
@property
@@ -363,8 +402,10 @@ def tz(self):
@tz.setter
def tz(self, value):
# GH 3746: Prevent localizing or converting the index by setting tz
- raise AttributeError("Cannot directly set timezone. Use tz_localize() "
- "or tz_convert() as appropriate")
+ raise AttributeError(
+ "Cannot directly set timezone. Use tz_localize() "
+ "or tz_convert() as appropriate"
+ )
tzinfo = tz
@@ -372,6 +413,7 @@ def tz(self, value):
def _is_dates_only(self):
"""Return a boolean if we are only dates (and don't have a timezone)"""
from pandas.io.formats.format import _is_dates_only
+
return _is_dates_only(self.values) and self.tz is None
def __reduce__(self):
@@ -413,20 +455,21 @@ def __setstate__(self, state):
else:
raise Exception("invalid pickle state")
+
_unpickle_compat = __setstate__
def _convert_for_op(self, value):
""" Convert value to be insertable to ndarray """
if self._has_same_tz(value):
return _to_M8(value)
- raise ValueError('Passed item and index have different timezone')
+ raise ValueError("Passed item and index have different timezone")
def _maybe_update_attributes(self, attrs):
""" Update Index attributes (e.g. freq) depending on op """
- freq = attrs.get('freq', None)
+ freq = attrs.get("freq", None)
if freq is not None:
# no need to infer if freq is None
- attrs['freq'] = 'infer'
+ attrs["freq"] = "infer"
return attrs
# --------------------------------------------------------------------
@@ -436,18 +479,19 @@ def _mpl_repr(self):
# how to represent ourselves to matplotlib
return libts.ints_to_pydatetime(self.asi8, self.tz)
- def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
from pandas.io.formats.format import _get_format_datetime64_from_values
+
fmt = _get_format_datetime64_from_values(self, date_format)
- return libts.format_array_from_datetime(self.asi8,
- tz=self.tz,
- format=fmt,
- na_rep=na_rep)
+ return libts.format_array_from_datetime(
+ self.asi8, tz=self.tz, format=fmt, na_rep=na_rep
+ )
@property
def _formatter_func(self):
from pandas.io.formats.format import _get_format_datetime64
+
formatter = _get_format_datetime64(is_dates_only=self._is_dates_only)
return lambda x: "'%s'" % formatter(x, tz=self.tz)
@@ -477,8 +521,9 @@ def _union(self, other, sort):
# TODO: we shouldn't be setting attributes like this;
# in all the tests this equality already holds
result._data._dtype = this.dtype
- if (result.freq is None and
- (this.freq is not None or other.freq is not None)):
+ if result.freq is None and (
+ this.freq is not None or other.freq is not None
+ ):
result.freq = to_offset(result.inferred_freq)
return result
@@ -561,7 +606,7 @@ def _fast_union(self, other, sort=None):
elif sort is False:
left, right = self, other
left_start = left[0]
- loc = right.searchsorted(left_start, side='left')
+ loc = right.searchsorted(left_start, side="left")
right_chunk = right.values[:loc]
dates = _concat._concat_compat((left.values, right_chunk))
return self._shallow_copy(dates)
@@ -577,7 +622,7 @@ def _fast_union(self, other, sort=None):
# concatenate dates
if left_end < right_end:
- loc = right.searchsorted(left_end, side='right')
+ loc = right.searchsorted(left_end, side="right")
right_chunk = right.values[loc:]
dates = _concat._concat_compat((left.values, right_chunk))
return self._shallow_copy(dates)
@@ -666,18 +711,26 @@ def to_series(self, keep_tz=None, index=None, name=None):
name = self.name
if keep_tz is None and self.tz is not None:
- warnings.warn("The default of the 'keep_tz' keyword in "
- "DatetimeIndex.to_series will change "
- "to True in a future release. You can set "
- "'keep_tz=True' to obtain the future behaviour and "
- "silence this warning.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The default of the 'keep_tz' keyword in "
+ "DatetimeIndex.to_series will change "
+ "to True in a future release. You can set "
+ "'keep_tz=True' to obtain the future behaviour and "
+ "silence this warning.",
+ FutureWarning,
+ stacklevel=2,
+ )
keep_tz = False
elif keep_tz is False:
- warnings.warn("Specifying 'keep_tz=False' is deprecated and this "
- "option will be removed in a future release. If "
- "you want to remove the timezone information, you "
- "can do 'idx.tz_convert(None)' before calling "
- "'to_series'.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Specifying 'keep_tz=False' is deprecated and this "
+ "option will be removed in a future release. If "
+ "you want to remove the timezone information, you "
+ "can do 'idx.tz_convert(None)' before calling "
+ "'to_series'.",
+ FutureWarning,
+ stacklevel=2,
+ )
if keep_tz and self.tz is not None:
# preserve the tz & copy
@@ -687,7 +740,7 @@ def to_series(self, keep_tz=None, index=None, name=None):
return Series(values, index=index, name=name)
- def snap(self, freq='S'):
+ def snap(self, freq="S"):
"""
Snap time stamps to nearest occurring frequency
@@ -712,52 +765,67 @@ def snap(self, freq='S'):
snapped[i] = s
# we know it conforms; skip check
- return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz,
- freq=freq)
+ return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq)
- def join(self, other, how='left', level=None, return_indexers=False,
- sort=False):
+ def join(self, other, how="left", level=None, return_indexers=False, sort=False):
"""
See Index.join
"""
- if (not isinstance(other, DatetimeIndex) and len(other) > 0 and
- other.inferred_type not in ('floating', 'integer', 'mixed-integer',
- 'mixed-integer-float', 'mixed')):
+ if (
+ not isinstance(other, DatetimeIndex)
+ and len(other) > 0
+ and other.inferred_type
+ not in (
+ "floating",
+ "integer",
+ "mixed-integer",
+ "mixed-integer-float",
+ "mixed",
+ )
+ ):
try:
other = DatetimeIndex(other)
except (TypeError, ValueError):
pass
this, other = self._maybe_utc_convert(other)
- return Index.join(this, other, how=how, level=level,
- return_indexers=return_indexers, sort=sort)
+ return Index.join(
+ this,
+ other,
+ how=how,
+ level=level,
+ return_indexers=return_indexers,
+ sort=sort,
+ )
def _maybe_utc_convert(self, other):
this = self
if isinstance(other, DatetimeIndex):
if self.tz is not None:
if other.tz is None:
- raise TypeError('Cannot join tz-naive with tz-aware '
- 'DatetimeIndex')
+ raise TypeError(
+ "Cannot join tz-naive with tz-aware " "DatetimeIndex"
+ )
elif other.tz is not None:
- raise TypeError('Cannot join tz-naive with tz-aware '
- 'DatetimeIndex')
+ raise TypeError("Cannot join tz-naive with tz-aware " "DatetimeIndex")
if not timezones.tz_compare(self.tz, other.tz):
- this = self.tz_convert('UTC')
- other = other.tz_convert('UTC')
+ this = self.tz_convert("UTC")
+ other = other.tz_convert("UTC")
return this, other
def _wrap_joined_index(self, joined, other):
name = get_op_result_name(self, other)
- if (isinstance(other, DatetimeIndex) and
- self.freq == other.freq and
- self._can_fast_union(other)):
+ if (
+ isinstance(other, DatetimeIndex)
+ and self.freq == other.freq
+ and self._can_fast_union(other)
+ ):
joined = self._shallow_copy(joined)
joined.name = name
return joined
else:
- tz = getattr(other, 'tz', None)
+ tz = getattr(other, "tz", None)
return self._simple_new(joined, name, tz=tz)
def _parsed_string_to_bounds(self, reso, parsed):
@@ -776,41 +844,63 @@ def _parsed_string_to_bounds(self, reso, parsed):
lower, upper: pd.Timestamp
"""
- valid_resos = {'year', 'month', 'quarter', 'day', 'hour', 'minute',
- 'second', 'minute', 'second', 'microsecond'}
+ valid_resos = {
+ "year",
+ "month",
+ "quarter",
+ "day",
+ "hour",
+ "minute",
+ "second",
+ "microsecond",
+ }
if reso not in valid_resos:
raise KeyError
- if reso == 'year':
+ if reso == "year":
start = Timestamp(parsed.year, 1, 1)
end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999)
- elif reso == 'month':
+ elif reso == "month":
d = ccalendar.get_days_in_month(parsed.year, parsed.month)
start = Timestamp(parsed.year, parsed.month, 1)
end = Timestamp(parsed.year, parsed.month, d, 23, 59, 59, 999999)
- elif reso == 'quarter':
+ elif reso == "quarter":
qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead
d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month
start = Timestamp(parsed.year, parsed.month, 1)
end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999)
- elif reso == 'day':
+ elif reso == "day":
start = Timestamp(parsed.year, parsed.month, parsed.day)
end = start + timedelta(days=1) - Nano(1)
- elif reso == 'hour':
- start = Timestamp(parsed.year, parsed.month, parsed.day,
- parsed.hour)
+ elif reso == "hour":
+ start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour)
end = start + timedelta(hours=1) - Nano(1)
- elif reso == 'minute':
- start = Timestamp(parsed.year, parsed.month, parsed.day,
- parsed.hour, parsed.minute)
+ elif reso == "minute":
+ start = Timestamp(
+ parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute
+ )
end = start + timedelta(minutes=1) - Nano(1)
- elif reso == 'second':
- start = Timestamp(parsed.year, parsed.month, parsed.day,
- parsed.hour, parsed.minute, parsed.second)
+ elif reso == "second":
+ start = Timestamp(
+ parsed.year,
+ parsed.month,
+ parsed.day,
+ parsed.hour,
+ parsed.minute,
+ parsed.second,
+ )
end = start + timedelta(seconds=1) - Nano(1)
- elif reso == 'microsecond':
- start = Timestamp(parsed.year, parsed.month, parsed.day,
- parsed.hour, parsed.minute, parsed.second,
- parsed.microsecond)
+ elif reso == "microsecond":
+ start = Timestamp(
+ parsed.year,
+ parsed.month,
+ parsed.day,
+ parsed.hour,
+ parsed.minute,
+ parsed.second,
+ parsed.microsecond,
+ )
end = start + timedelta(microseconds=1) - Nano(1)
# GH 24076
# If an incoming date string contained a UTC offset, need to localize
@@ -818,9 +908,11 @@ def _parsed_string_to_bounds(self, reso, parsed):
# timezone
if parsed.tzinfo is not None:
if self.tz is None:
- raise ValueError("The index must be timezone aware "
- "when indexing with a date string with a "
- "UTC offset")
+ raise ValueError(
+ "The index must be timezone aware "
+ "when indexing with a date string with a "
+ "UTC offset"
+ )
start = start.tz_localize(parsed.tzinfo).tz_convert(self.tz)
end = end.tz_localize(parsed.tzinfo).tz_convert(self.tz)
elif self.tz is not None:
@@ -830,15 +922,18 @@ def _parsed_string_to_bounds(self, reso, parsed):
def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True):
is_monotonic = self.is_monotonic
- if (is_monotonic and reso in ['day', 'hour', 'minute', 'second'] and
- self._resolution >= Resolution.get_reso(reso)):
+ if (
+ is_monotonic
+ and reso in ["day", "hour", "minute", "second"]
+ and self._resolution >= Resolution.get_reso(reso)
+ ):
# These resolution/monotonicity validations came from GH3931,
# GH3452 and GH2369.
# See also GH14826
raise KeyError
- if reso == 'microsecond':
+ if reso == "microsecond":
# _partial_date_slice doesn't allow microsecond resolution, but
# _parsed_string_to_bounds allows it.
raise KeyError
@@ -849,17 +944,15 @@ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True):
if is_monotonic:
# we are out of range
- if (len(stamps) and ((use_lhs and t1.value < stamps[0] and
- t2.value < stamps[0]) or
- ((use_rhs and t1.value > stamps[-1] and
- t2.value > stamps[-1])))):
+ if len(stamps) and (
+ (use_lhs and t1.value < stamps[0] and t2.value < stamps[0])
+ or ((use_rhs and t1.value > stamps[-1] and t2.value > stamps[-1]))
+ ):
raise KeyError
# a monotonic (sorted) series can be sliced
- left = stamps.searchsorted(
- t1.value, side='left') if use_lhs else None
- right = stamps.searchsorted(
- t2.value, side='right') if use_rhs else None
+ left = stamps.searchsorted(t1.value, side="left") if use_lhs else None
+ right = stamps.searchsorted(t2.value, side="right") if use_rhs else None
return slice(left, right)
@@ -870,7 +963,7 @@ def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True):
return (lhs_mask & rhs_mask).nonzero()[0]
def _maybe_promote(self, other):
- if other.inferred_type == 'date':
+ if other.inferred_type == "date":
other = DatetimeIndex(other)
return self, other
@@ -896,8 +989,7 @@ def get_value(self, series, key):
return series.take(locs)
try:
- return com.maybe_box(self, Index.get_value(self, series, key),
- series, key)
+ return com.maybe_box(self, Index.get_value(self, series, key), series, key)
except KeyError:
try:
loc = self._get_string_slice(key)
@@ -920,8 +1012,7 @@ def get_value_maybe_box(self, series, key):
key = key.tz_localize(self.tz)
elif not isinstance(key, Timestamp):
key = Timestamp(key)
- values = self._engine.get_value(com.values_from_object(series),
- key, tz=self.tz)
+ values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz)
return com.maybe_box(self, values, series, key)
def get_loc(self, key, method=None, tolerance=None):
@@ -948,14 +1039,17 @@ def get_loc(self, key, method=None, tolerance=None):
elif isinstance(key, timedelta):
# GH#20464
- raise TypeError("Cannot index {cls} with {other}"
- .format(cls=type(self).__name__,
- other=type(key).__name__))
+ raise TypeError(
+ "Cannot index {cls} with {other}".format(
+ cls=type(self).__name__, other=type(key).__name__
+ )
+ )
if isinstance(key, time):
if method is not None:
- raise NotImplementedError('cannot yet lookup inexact labels '
- 'when key is a time object')
+ raise NotImplementedError(
+ "cannot yet lookup inexact labels " "when key is a time object"
+ )
return self.indexer_at_time(key)
try:
@@ -977,7 +1071,7 @@ def get_loc(self, key, method=None, tolerance=None):
raise KeyError(key)
except ValueError as e:
# list-like tolerance size must match target index size
- if 'list-like' in str(e):
+ if "list-like" in str(e):
raise e
raise KeyError(key)
@@ -1000,14 +1094,13 @@ def _maybe_cast_slice_bound(self, label, side, kind):
Value of `side` parameter should be validated in caller.
"""
- assert kind in ['ix', 'loc', 'getitem', None]
+ assert kind in ["ix", "loc", "getitem", None]
if is_float(label) or isinstance(label, time) or is_integer(label):
- self._invalid_indexer('slice', label)
+ self._invalid_indexer("slice", label)
if isinstance(label, str):
- freq = getattr(self, 'freqstr',
- getattr(self, 'inferred_freq', None))
+ freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
_, parsed, reso = parsing.parse_time_string(label, freq)
lower, upper = self._parsed_string_to_bounds(reso, parsed)
# lower, upper form the half-open interval:
@@ -1017,17 +1110,15 @@ def _maybe_cast_slice_bound(self, label, side, kind):
# length > 1 (is_monotonic_decreasing gives True for empty
# and length 1 index)
if self._is_strictly_monotonic_decreasing and len(self) > 1:
- return upper if side == 'left' else lower
- return lower if side == 'left' else upper
+ return upper if side == "left" else lower
+ return lower if side == "left" else upper
else:
return label
def _get_string_slice(self, key, use_lhs=True, use_rhs=True):
- freq = getattr(self, 'freqstr',
- getattr(self, 'inferred_freq', None))
+ freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
_, parsed, reso = parsing.parse_time_string(key, freq)
- loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs,
- use_rhs=use_rhs)
+ loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs)
return loc
def slice_indexer(self, start=None, end=None, step=None, kind=None):
@@ -1049,11 +1140,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
# an array of (self.hour, self.minute, self.seconds, self.microsecond).
if isinstance(start, time) and isinstance(end, time):
if step is not None and step != 1:
- raise ValueError('Must have step size of 1 with time slices')
+ raise ValueError("Must have step size of 1 with time slices")
return self.indexer_between_time(start, end)
if isinstance(start, time) or isinstance(end, time):
- raise KeyError('Cannot mix time and non-time slice keys')
+ raise KeyError("Cannot mix time and non-time slice keys")
try:
return Index.slice_indexer(self, start, end, step, kind=kind)
@@ -1061,17 +1152,16 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
# For historical reasons DatetimeIndex by default supports
# value-based partial (aka string) slices on non-monotonic arrays,
# let's try that.
- if ((start is None or isinstance(start, str)) and
- (end is None or isinstance(end, str))):
+ if (start is None or isinstance(start, str)) and (
+ end is None or isinstance(end, str)
+ ):
mask = True
if start is not None:
- start_casted = self._maybe_cast_slice_bound(
- start, 'left', kind)
+ start_casted = self._maybe_cast_slice_bound(start, "left", kind)
mask = start_casted <= self
if end is not None:
- end_casted = self._maybe_cast_slice_bound(
- end, 'right', kind)
+ end_casted = self._maybe_cast_slice_bound(end, "right", kind)
mask = (self <= end_casted) & mask
indexer = mask.nonzero()[0][::step]
@@ -1091,10 +1181,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):
_is_unique = Index.is_unique
_timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore
- is_normalized = cache_readonly(
- DatetimeArray.is_normalized.fget) # type: ignore
- _resolution = cache_readonly(
- DatetimeArray._resolution.fget) # type: ignore
+ is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore
+ _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore
strftime = ea_passthrough(DatetimeArray.strftime)
_has_same_tz = ea_passthrough(DatetimeArray._has_same_tz)
@@ -1104,9 +1192,12 @@ def offset(self):
"""
get/set the frequency of the instance
"""
- msg = ('{cls}.offset has been deprecated and will be removed '
- 'in a future version; use {cls}.freq instead.'
- .format(cls=type(self).__name__))
+ msg = (
+ "{cls}.offset has been deprecated and will be removed "
+ "in a future version; use {cls}.freq instead.".format(
+ cls=type(self).__name__
+ )
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
return self.freq
@@ -1115,9 +1206,12 @@ def offset(self, value):
"""
get/set the frequency of the instance
"""
- msg = ('{cls}.offset has been deprecated and will be removed '
- 'in a future version; use {cls}.freq instead.'
- .format(cls=type(self).__name__))
+ msg = (
+ "{cls}.offset has been deprecated and will be removed "
+ "in a future version; use {cls}.freq instead.".format(
+ cls=type(self).__name__
+ )
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
self.freq = value
@@ -1138,9 +1232,9 @@ def _box_func(self):
# --------------------------------------------------------------------
- @Substitution(klass='DatetimeIndex')
- @Appender(_shared_docs['searchsorted'])
- def searchsorted(self, value, side='left', sorter=None):
+ @Substitution(klass="DatetimeIndex")
+ @Appender(_shared_docs["searchsorted"])
+ def searchsorted(self, value, side="left", sorter=None):
if isinstance(value, (np.ndarray, Index)):
value = np.array(value, dtype=_NS_DTYPE, copy=False)
else:
@@ -1149,13 +1243,13 @@ def searchsorted(self, value, side='left', sorter=None):
return self.values.searchsorted(value, side=side)
def is_type_compatible(self, typ):
- return typ == self.inferred_type or typ == 'datetime'
+ return typ == self.inferred_type or typ == "datetime"
@property
def inferred_type(self):
# b/c datetime is represented as microseconds since the epoch, make
# sure we can't have ambiguous indexing
- return 'datetime64'
+ return "datetime64"
@property
def is_all_dates(self):
@@ -1185,28 +1279,26 @@ def insert(self, loc, item):
if isinstance(item, (datetime, np.datetime64)):
self._assert_can_do_op(item)
if not self._has_same_tz(item) and not isna(item):
- raise ValueError(
- 'Passed item and index have different timezone')
+ raise ValueError("Passed item and index have different timezone")
# check freq can be preserved on edge cases
if self.size and self.freq is not None:
- if ((loc == 0 or loc == -len(self)) and
- item + self.freq == self[0]):
+ if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]:
freq = self.freq
elif (loc == len(self)) and item - self.freq == self[-1]:
freq = self.freq
item = _to_M8(item, tz=self.tz)
try:
- new_dates = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
- self[loc:].asi8))
+ new_dates = np.concatenate(
+ (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)
+ )
return self._shallow_copy(new_dates, freq=freq)
except (AttributeError, TypeError):
# fall back to object index
if isinstance(item, str):
return self.astype(object).insert(loc, item)
- raise TypeError(
- "cannot insert DatetimeIndex with incompatible label")
+ raise TypeError("cannot insert DatetimeIndex with incompatible label")
def delete(self, loc):
"""
@@ -1229,10 +1321,9 @@ def delete(self, loc):
freq = self.freq
else:
if is_list_like(loc):
- loc = lib.maybe_indices_to_slice(
- ensure_int64(np.array(loc)), len(self))
+ loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self))
if isinstance(loc, slice) and loc.step in (1, None):
- if (loc.start in (0, None) or loc.stop in (len(self), None)):
+ if loc.start in (0, None) or loc.stop in (len(self), None):
freq = self.freq
return self._shallow_copy(new_dates, freq=freq)
@@ -1262,6 +1353,7 @@ def indexer_at_time(self, time, asof=False):
if isinstance(time, str):
from dateutil.parser import parse
+
time = parse(time).time()
if time.tzinfo:
@@ -1273,8 +1365,9 @@ def indexer_at_time(self, time, asof=False):
micros = _time_to_micros(time)
return (micros == time_micros).nonzero()[0]
- def indexer_between_time(self, start_time, end_time, include_start=True,
- include_end=True):
+ def indexer_between_time(
+ self, start_time, end_time, include_start=True, include_end=True
+ ):
"""
Return index locations of values between particular times of day
(e.g., 9:00-9:30AM).
@@ -1318,8 +1411,7 @@ def indexer_between_time(self, start_time, end_time, include_start=True,
else:
join_op = operator.or_
- mask = join_op(lop(start_micros, time_micros),
- rop(time_micros, end_micros))
+ mask = join_op(lop(start_micros, time_micros), rop(time_micros, end_micros))
return mask.nonzero()[0]
@@ -1330,8 +1422,17 @@ def indexer_between_time(self, start_time, end_time, include_start=True,
DatetimeIndex._add_datetimelike_methods()
-def date_range(start=None, end=None, periods=None, freq=None, tz=None,
- normalize=False, name=None, closed=None, **kwargs):
+def date_range(
+ start=None,
+ end=None,
+ periods=None,
+ freq=None,
+ tz=None,
+ normalize=False,
+ name=None,
+ closed=None,
+ **kwargs
+):
"""
Return a fixed frequency DatetimeIndex.
@@ -1470,19 +1571,34 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None,
"""
if freq is None and com._any_none(periods, start, end):
- freq = 'D'
+ freq = "D"
dtarr = DatetimeArray._generate_range(
- start=start, end=end, periods=periods,
- freq=freq, tz=tz, normalize=normalize,
- closed=closed, **kwargs)
- return DatetimeIndex._simple_new(
- dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name)
-
-
-def bdate_range(start=None, end=None, periods=None, freq='B', tz=None,
- normalize=True, name=None, weekmask=None, holidays=None,
- closed=None, **kwargs):
+ start=start,
+ end=end,
+ periods=periods,
+ freq=freq,
+ tz=tz,
+ normalize=normalize,
+ closed=closed,
+ **kwargs
+ )
+ return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name)
+
+
+def bdate_range(
+ start=None,
+ end=None,
+ periods=None,
+ freq="B",
+ tz=None,
+ normalize=True,
+ name=None,
+ weekmask=None,
+ holidays=None,
+ closed=None,
+ **kwargs
+):
"""
Return a fixed frequency DatetimeIndex, with business day as the default
frequency
@@ -1548,24 +1664,34 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None,
dtype='datetime64[ns]', freq='B')
"""
if freq is None:
- msg = 'freq must be specified for bdate_range; use date_range instead'
+ msg = "freq must be specified for bdate_range; use date_range instead"
raise TypeError(msg)
- if is_string_like(freq) and freq.startswith('C'):
+ if is_string_like(freq) and freq.startswith("C"):
try:
- weekmask = weekmask or 'Mon Tue Wed Thu Fri'
+ weekmask = weekmask or "Mon Tue Wed Thu Fri"
freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask)
except (KeyError, TypeError):
- msg = 'invalid custom frequency string: {freq}'.format(freq=freq)
+ msg = "invalid custom frequency string: {freq}".format(freq=freq)
raise ValueError(msg)
elif holidays or weekmask:
- msg = ('a custom frequency string is required when holidays or '
- 'weekmask are passed, got frequency {freq}').format(freq=freq)
+ msg = (
+ "a custom frequency string is required when holidays or "
+ "weekmask are passed, got frequency {freq}"
+ ).format(freq=freq)
raise ValueError(msg)
- return date_range(start=start, end=end, periods=periods,
- freq=freq, tz=tz, normalize=normalize, name=name,
- closed=closed, **kwargs)
+ return date_range(
+ start=start,
+ end=end,
+ periods=periods,
+ freq=freq,
+ tz=tz,
+ normalize=normalize,
+ name=name,
+ closed=closed,
+ **kwargs
+ )
def _time_to_micros(time):
diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py
index aeb0fa119ab33..2e5b3ff8ef502 100644
--- a/pandas/core/indexes/frozen.py
+++ b/pandas/core/indexes/frozen.py
@@ -28,6 +28,7 @@ class FrozenList(PandasObject, list):
because it's technically non-hashable, will be used
for lookups, appropriately, etc.
"""
+
# Side note: This has to be of type list. Otherwise,
# it messes up PyTables type checks.
@@ -105,16 +106,15 @@ def __hash__(self):
def _disabled(self, *args, **kwargs):
"""This method will not function because object is immutable."""
- raise TypeError("'%s' does not support mutable operations." %
- self.__class__.__name__)
+ raise TypeError(
+ "'%s' does not support mutable operations." % self.__class__.__name__
+ )
def __str__(self):
- return pprint_thing(self, quote_strings=True,
- escape_chars=('\t', '\r', '\n'))
+ return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n"))
def __repr__(self):
- return "%s(%s)" % (self.__class__.__name__,
- str(self))
+ return "%s(%s)" % (self.__class__.__name__, str(self))
__setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
pop = append = extend = remove = sort = insert = _disabled
@@ -124,9 +124,12 @@ class FrozenNDArray(PandasObject, np.ndarray):
# no __array_finalize__ for now because no metadata
def __new__(cls, data, dtype=None, copy=False):
- warnings.warn("\nFrozenNDArray is deprecated and will be removed in a "
- "future version.\nPlease use `numpy.ndarray` instead.\n",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "\nFrozenNDArray is deprecated and will be removed in a "
+ "future version.\nPlease use `numpy.ndarray` instead.\n",
+ FutureWarning,
+ stacklevel=2,
+ )
if copy is None:
copy = not isinstance(data, FrozenNDArray)
@@ -135,8 +138,7 @@ def __new__(cls, data, dtype=None, copy=False):
def _disabled(self, *args, **kwargs):
"""This method will not function because object is immutable."""
- raise TypeError("'%s' does not support mutable operations." %
- self.__class__)
+ raise TypeError("'%s' does not support mutable operations." % self.__class__)
__setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled
put = itemset = fill = _disabled
@@ -153,8 +155,7 @@ def __repr__(self):
"""
Return a string representation for this object.
"""
- prepr = pprint_thing(self, escape_chars=('\t', '\r', '\n'),
- quote_strings=True)
+ prepr = pprint_thing(self, escape_chars=("\t", "\r", "\n"), quote_strings=True)
return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype)
@deprecate_kwarg(old_arg_name="v", new_arg_name="value")
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 83bc5963f4f9e..b14cff8cc6ade 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -14,11 +14,25 @@
from pandas.util._exceptions import rewrite_exception
from pandas.core.dtypes.cast import (
- find_common_type, infer_dtype_from_scalar, maybe_downcast_to_dtype)
+ find_common_type,
+ infer_dtype_from_scalar,
+ maybe_downcast_to_dtype,
+)
from pandas.core.dtypes.common import (
- ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
- is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype,
- is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar)
+ ensure_platform_int,
+ is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype,
+ is_dtype_equal,
+ is_float,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_interval_dtype,
+ is_list_like,
+ is_number,
+ is_object_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.generic import ABCSeries
from pandas.core.dtypes.missing import isna
@@ -27,7 +41,12 @@
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
- Index, InvalidIndexError, _index_shared_docs, default_pprint, ensure_index)
+ Index,
+ InvalidIndexError,
+ _index_shared_docs,
+ default_pprint,
+ ensure_index,
+)
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
from pandas.core.indexes.multi import MultiIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range
@@ -36,48 +55,54 @@
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import DateOffset
-_VALID_CLOSED = {'left', 'right', 'both', 'neither'}
+_VALID_CLOSED = {"left", "right", "both", "neither"}
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(
- dict(klass='IntervalIndex',
- qualname="IntervalIndex",
- target_klass='IntervalIndex or list of Intervals',
- name=textwrap.dedent("""\
+ dict(
+ klass="IntervalIndex",
+ qualname="IntervalIndex",
+ target_klass="IntervalIndex or list of Intervals",
+ name=textwrap.dedent(
+ """\
name : object, optional
Name to be stored in the index.
- """),
- ))
+ """
+ ),
+ )
+)
def _get_next_label(label):
- dtype = getattr(label, 'dtype', type(label))
+ dtype = getattr(label, "dtype", type(label))
if isinstance(label, (Timestamp, Timedelta)):
- dtype = 'datetime64'
+ dtype = "datetime64"
if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
- return label + np.timedelta64(1, 'ns')
+ return label + np.timedelta64(1, "ns")
elif is_integer_dtype(dtype):
return label + 1
elif is_float_dtype(dtype):
return np.nextafter(label, np.infty)
else:
- raise TypeError('cannot determine next label for type {typ!r}'
- .format(typ=type(label)))
+ raise TypeError(
+ "cannot determine next label for type {typ!r}".format(typ=type(label))
+ )
def _get_prev_label(label):
- dtype = getattr(label, 'dtype', type(label))
+ dtype = getattr(label, "dtype", type(label))
if isinstance(label, (Timestamp, Timedelta)):
- dtype = 'datetime64'
+ dtype = "datetime64"
if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
- return label - np.timedelta64(1, 'ns')
+ return label - np.timedelta64(1, "ns")
elif is_integer_dtype(dtype):
return label - 1
elif is_float_dtype(dtype):
return np.nextafter(label, -np.infty)
else:
- raise TypeError('cannot determine next label for type {typ!r}'
- .format(typ=type(label)))
+ raise TypeError(
+ "cannot determine next label for type {typ!r}".format(typ=type(label))
+ )
def _get_interval_closed_bounds(interval):
@@ -106,6 +131,7 @@ class SetopCheck:
This is called to decorate the set operations of IntervalIndex
to perform the type check in advance.
"""
+
def __init__(self, op_name):
self.op_name = op_name
@@ -115,36 +141,43 @@ def func(intvidx_self, other, sort=False):
other = ensure_index(other)
if not isinstance(other, IntervalIndex):
- result = getattr(intvidx_self.astype(object),
- self.op_name)(other)
- if self.op_name in ('difference',):
+ result = getattr(intvidx_self.astype(object), self.op_name)(other)
+ if self.op_name in ("difference",):
result = result.astype(intvidx_self.dtype)
return result
elif intvidx_self.closed != other.closed:
- msg = ('can only do set operations between two IntervalIndex '
- 'objects that are closed on the same side')
+ msg = (
+ "can only do set operations between two IntervalIndex "
+ "objects that are closed on the same side"
+ )
raise ValueError(msg)
# GH 19016: ensure set op will not return a prohibited dtype
subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype]
common_subtype = find_common_type(subtypes)
if is_object_dtype(common_subtype):
- msg = ('can only do {op} between two IntervalIndex '
- 'objects that have compatible dtypes')
+ msg = (
+ "can only do {op} between two IntervalIndex "
+ "objects that have compatible dtypes"
+ )
raise TypeError(msg.format(op=self.op_name))
return setop(intvidx_self, other, sort)
+
return func
-@Appender(_interval_shared_docs['class'] % dict(
- klass="IntervalIndex",
- summary="Immutable index of intervals that are closed on the same side.",
- name=_index_doc_kwargs['name'],
- versionadded="0.20.0",
- extra_attributes="is_overlapping\nvalues\n",
- extra_methods="",
- examples=textwrap.dedent("""\
+@Appender(
+ _interval_shared_docs["class"]
+ % dict(
+ klass="IntervalIndex",
+ summary="Immutable index of intervals that are closed on the same side.",
+ name=_index_doc_kwargs["name"],
+ versionadded="0.20.0",
+ extra_attributes="is_overlapping\nvalues\n",
+ extra_methods="",
+ examples=textwrap.dedent(
+ """\
Examples
--------
A new ``IntervalIndex`` is typically constructed using
@@ -161,13 +194,14 @@ def func(intvidx_self, other, sort=False):
See further examples in the doc strings of ``interval_range`` and the
mentioned constructor methods.
- """),
-
-))
+ """
+ ),
+ )
+)
class IntervalIndex(IntervalMixin, Index):
- _typ = 'intervalindex'
- _comparables = ['name']
- _attributes = ['name', 'closed']
+ _typ = "intervalindex"
+ _comparables = ["name"]
+ _attributes = ["name", "closed"]
# we would like our indexing holder to defer to us
_defer_to_indexing = True
@@ -178,15 +212,21 @@ class IntervalIndex(IntervalMixin, Index):
# --------------------------------------------------------------------
# Constructors
- def __new__(cls, data, closed=None, dtype=None, copy=False,
- name=None, verify_integrity=True):
+ def __new__(
+ cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True
+ ):
- if name is None and hasattr(data, 'name'):
+ if name is None and hasattr(data, "name"):
name = data.name
with rewrite_exception("IntervalArray", cls.__name__):
- array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype,
- verify_integrity=verify_integrity)
+ array = IntervalArray(
+ data,
+ closed=closed,
+ copy=copy,
+ dtype=dtype,
+ verify_integrity=verify_integrity,
+ )
return cls._simple_new(array, name)
@@ -210,29 +250,32 @@ def _simple_new(cls, array, name, closed=None):
return result
@classmethod
- @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs)
- def from_breaks(cls, breaks, closed='right', name=None, copy=False,
- dtype=None):
+ @Appender(_interval_shared_docs["from_breaks"] % _index_doc_kwargs)
+ def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None):
with rewrite_exception("IntervalArray", cls.__name__):
- array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy,
- dtype=dtype)
+ array = IntervalArray.from_breaks(
+ breaks, closed=closed, copy=copy, dtype=dtype
+ )
return cls._simple_new(array, name=name)
@classmethod
- @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs)
- def from_arrays(cls, left, right, closed='right', name=None, copy=False,
- dtype=None):
+ @Appender(_interval_shared_docs["from_arrays"] % _index_doc_kwargs)
+ def from_arrays(
+ cls, left, right, closed="right", name=None, copy=False, dtype=None
+ ):
with rewrite_exception("IntervalArray", cls.__name__):
- array = IntervalArray.from_arrays(left, right, closed, copy=copy,
- dtype=dtype)
+ array = IntervalArray.from_arrays(
+ left, right, closed, copy=copy, dtype=dtype
+ )
return cls._simple_new(array, name=name)
@classmethod
- @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs)
- def from_intervals(cls, data, closed=None, name=None, copy=False,
- dtype=None):
- msg = ('IntervalIndex.from_intervals is deprecated and will be '
- 'removed in a future version; Use IntervalIndex(...) instead')
+ @Appender(_interval_shared_docs["from_intervals"] % _index_doc_kwargs)
+ def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None):
+ msg = (
+ "IntervalIndex.from_intervals is deprecated and will be "
+ "removed in a future version; Use IntervalIndex(...) instead"
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
with rewrite_exception("IntervalArray", cls.__name__):
array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype)
@@ -243,17 +286,15 @@ def from_intervals(cls, data, closed=None, name=None, copy=False,
return cls._simple_new(array, name=name)
@classmethod
- @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs)
- def from_tuples(cls, data, closed='right', name=None, copy=False,
- dtype=None):
+ @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs)
+ def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None):
with rewrite_exception("IntervalArray", cls.__name__):
- arr = IntervalArray.from_tuples(data, closed=closed, copy=copy,
- dtype=dtype)
+ arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype)
return cls._simple_new(arr, name=name)
# --------------------------------------------------------------------
- @Appender(_index_shared_docs['_shallow_copy'])
+ @Appender(_index_shared_docs["_shallow_copy"])
def _shallow_copy(self, left=None, right=None, **kwargs):
result = self._data._shallow_copy(left=left, right=right)
attributes = self._get_attributes_dict()
@@ -295,9 +336,11 @@ def __contains__(self, key):
except KeyError:
return False
- @Appender(_interval_shared_docs['to_tuples'] % dict(
- return_type="Index",
- examples="""
+ @Appender(
+ _interval_shared_docs["to_tuples"]
+ % dict(
+ return_type="Index",
+ examples="""
Examples
--------
>>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3])
@@ -305,15 +348,15 @@ def __contains__(self, key):
Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object')
>>> idx.to_tuples(na_tuple=False)
Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""",
- ))
+ )
+ )
def to_tuples(self, na_tuple=True):
tuples = self._data.to_tuples(na_tuple=na_tuple)
return Index(tuples)
@cache_readonly
def _multiindex(self):
- return MultiIndex.from_arrays([self.left, self.right],
- names=['left', 'right'])
+ return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"])
@property
def left(self):
@@ -339,7 +382,7 @@ def closed(self):
"""
return self._data._closed
- @Appender(_interval_shared_docs['set_closed'] % _index_doc_kwargs)
+ @Appender(_interval_shared_docs["set_closed"] % _index_doc_kwargs)
def set_closed(self, closed):
if closed not in _VALID_CLOSED:
msg = "invalid option for 'closed': {closed}"
@@ -369,13 +412,15 @@ def shape(self):
@property
def itemsize(self):
- msg = ('IntervalIndex.itemsize is deprecated and will be removed in '
- 'a future version')
+ msg = (
+ "IntervalIndex.itemsize is deprecated and will be removed in "
+ "a future version"
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
# suppress the warning from the underlying left/right itemsize
with warnings.catch_warnings():
- warnings.simplefilter('ignore')
+ warnings.simplefilter("ignore")
return self.left.itemsize + self.right.itemsize
def __len__(self):
@@ -405,12 +450,11 @@ def __array_wrap__(self, result, context=None):
return result
def __reduce__(self):
- d = dict(left=self.left,
- right=self.right)
+ d = dict(left=self.left, right=self.right)
d.update(self._get_attributes_dict())
return _new_IntervalIndex, (self.__class__, d), None
- @Appender(_index_shared_docs['copy'])
+ @Appender(_index_shared_docs["copy"])
def copy(self, deep=False, name=None):
array = self._data
if deep:
@@ -421,9 +465,9 @@ def copy(self, deep=False, name=None):
return self._simple_new(array, **attributes)
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
- with rewrite_exception('IntervalArray', self.__class__.__name__):
+ with rewrite_exception("IntervalArray", self.__class__.__name__):
new_values = self.values.astype(dtype, copy=copy)
if is_interval_dtype(new_values):
return self._shallow_copy(new_values.left, new_values.right)
@@ -437,14 +481,13 @@ def dtype(self):
@property
def inferred_type(self):
"""Return a string of the type inferred from the values"""
- return 'interval'
+ return "interval"
@Appender(Index.memory_usage.__doc__)
def memory_usage(self, deep=False):
# we don't use an explicit engine
# so return the bytes here
- return (self.left.memory_usage(deep=deep) +
- self.right.memory_usage(deep=deep))
+ return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep)
@cache_readonly
def mid(self):
@@ -502,8 +545,7 @@ def is_unique(self):
return True
@cache_readonly
- @Appender(_interval_shared_docs['is_non_overlapping_monotonic']
- % _index_doc_kwargs)
+ @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % _index_doc_kwargs)
def is_non_overlapping_monotonic(self):
return self._data.is_non_overlapping_monotonic
@@ -562,16 +604,16 @@ def is_overlapping(self):
# GH 23309
return self._engine.is_overlapping
- @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ @Appender(_index_shared_docs["_convert_scalar_indexer"])
def _convert_scalar_indexer(self, key, kind=None):
- if kind == 'iloc':
+ if kind == "iloc":
return super()._convert_scalar_indexer(key, kind=kind)
return key
def _maybe_cast_slice_bound(self, label, side, kind):
return getattr(self, side)._maybe_cast_slice_bound(label, side, kind)
- @Appender(_index_shared_docs['_convert_list_indexer'])
+ @Appender(_index_shared_docs["_convert_list_indexer"])
def _convert_list_indexer(self, keyarr, kind=None):
"""
we are passed a list-like indexer. Return the
@@ -598,7 +640,7 @@ def _maybe_cast_indexed(self, key):
if is_integer(key):
key = float(key)
elif isinstance(key, (np.ndarray, Index)):
- key = key.astype('float64')
+ key = key.astype("float64")
elif is_integer_dtype(subtype):
if is_integer(key):
key = int(key)
@@ -691,8 +733,10 @@ def _maybe_convert_i8(self, key):
# ensure consistency with IntervalIndex subtype
subtype = self.dtype.subtype
- msg = ('Cannot index an IntervalIndex of subtype {subtype} with '
- 'values of dtype {other}')
+ msg = (
+ "Cannot index an IntervalIndex of subtype {subtype} with "
+ "values of dtype {other}"
+ )
if not is_dtype_equal(subtype, key_dtype):
raise ValueError(msg.format(subtype=subtype, other=key_dtype))
@@ -702,27 +746,30 @@ def _check_method(self, method):
if method is None:
return
- if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']:
- msg = 'method {method} not yet implemented for IntervalIndex'
+ if method in ["bfill", "backfill", "pad", "ffill", "nearest"]:
+ msg = "method {method} not yet implemented for IntervalIndex"
raise NotImplementedError(msg.format(method=method))
raise ValueError("Invalid fill method")
def _searchsorted_monotonic(self, label, side, exclude_label=False):
if not self.is_non_overlapping_monotonic:
- raise KeyError('can only get slices from an IntervalIndex if '
- 'bounds are non-overlapping and all monotonic '
- 'increasing or decreasing')
+ raise KeyError(
+ "can only get slices from an IntervalIndex if "
+ "bounds are non-overlapping and all monotonic "
+ "increasing or decreasing"
+ )
if isinstance(label, IntervalMixin):
- msg = 'Interval objects are not currently supported'
+ msg = "Interval objects are not currently supported"
raise NotImplementedError(msg)
# GH 20921: "not is_monotonic_increasing" for the second condition
# instead of "is_monotonic_decreasing" to account for single element
# indexes being both increasing and decreasing
- if ((side == 'left' and self.left.is_monotonic_increasing) or
- (side == 'right' and not self.left.is_monotonic_increasing)):
+ if (side == "left" and self.left.is_monotonic_increasing) or (
+ side == "right" and not self.left.is_monotonic_increasing
+ ):
sub_idx = self.right
if self.open_right or exclude_label:
label = _get_next_label(label)
@@ -736,9 +783,11 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False):
def _find_non_overlapping_monotonic_bounds(self, key):
if isinstance(key, IntervalMixin):
start = self._searchsorted_monotonic(
- key.left, 'left', exclude_label=key.open_left)
+ key.left, "left", exclude_label=key.open_left
+ )
stop = self._searchsorted_monotonic(
- key.right, 'right', exclude_label=key.open_right)
+ key.right, "right", exclude_label=key.open_right
+ )
elif isinstance(key, slice):
# slice
start, stop = key.start, key.stop
@@ -747,22 +796,21 @@ def _find_non_overlapping_monotonic_bounds(self, key):
if start is None:
start = 0
else:
- start = self._searchsorted_monotonic(start, 'left')
+ start = self._searchsorted_monotonic(start, "left")
if stop is None:
stop = len(self)
else:
- stop = self._searchsorted_monotonic(stop, 'right')
+ stop = self._searchsorted_monotonic(stop, "right")
else:
# scalar or index-like
- start = self._searchsorted_monotonic(key, 'left')
- stop = self._searchsorted_monotonic(key, 'right')
+ start = self._searchsorted_monotonic(key, "left")
+ stop = self._searchsorted_monotonic(key, "right")
return start, stop
- def get_loc(self,
- key: Any,
- method: Optional[str] = None
- ) -> Union[int, slice, np.ndarray]:
+ def get_loc(
+ self, key: Any, method: Optional[str] = None
+ ) -> Union[int, slice, np.ndarray]:
"""
Get integer location, slice or boolean mask for requested label.
@@ -827,29 +875,40 @@ def get_loc(self,
raise KeyError(key)
elif matches == 1:
return mask.argmax()
- return lib.maybe_booleans_to_slice(mask.view('u1'))
-
- @Substitution(**dict(_index_doc_kwargs,
- **{'raises_section': textwrap.dedent("""
+ return lib.maybe_booleans_to_slice(mask.view("u1"))
+
+ @Substitution(
+ **dict(
+ _index_doc_kwargs,
+ **{
+ "raises_section": textwrap.dedent(
+ """
Raises
------
NotImplementedError
If any method argument other than the default of
None is specified as these are not yet implemented.
- """)}))
- @Appender(_index_shared_docs['get_indexer'])
- def get_indexer(self,
- target: AnyArrayLike,
- method: Optional[str] = None,
- limit: Optional[int] = None,
- tolerance: Optional[Any] = None
- ) -> np.ndarray:
+ """
+ )
+ }
+ )
+ )
+ @Appender(_index_shared_docs["get_indexer"])
+ def get_indexer(
+ self,
+ target: AnyArrayLike,
+ method: Optional[str] = None,
+ limit: Optional[int] = None,
+ tolerance: Optional[Any] = None,
+ ) -> np.ndarray:
self._check_method(method)
if self.is_overlapping:
- msg = ('cannot handle overlapping indices; use '
- 'IntervalIndex.get_indexer_non_unique')
+ msg = (
+ "cannot handle overlapping indices; use "
+ "IntervalIndex.get_indexer_non_unique"
+ )
raise InvalidIndexError(msg)
target = ensure_index(target)
@@ -857,11 +916,12 @@ def get_indexer(self,
if isinstance(target, IntervalIndex):
# equal indexes -> 1:1 positional match
if self.equals(target):
- return np.arange(len(self), dtype='intp')
+ return np.arange(len(self), dtype="intp")
# different closed or incompatible subtype -> no matches
- common_subtype = find_common_type([
- self.dtype.subtype, target.dtype.subtype])
+ common_subtype = find_common_type(
+ [self.dtype.subtype, target.dtype.subtype]
+ )
if self.closed != target.closed or is_object_dtype(common_subtype):
return np.repeat(np.intp(-1), len(target))
@@ -888,16 +948,17 @@ def get_indexer(self,
return ensure_platform_int(indexer)
- @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
- def get_indexer_non_unique(self,
- target: AnyArrayLike
- ) -> Tuple[np.ndarray, np.ndarray]:
+ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
+ def get_indexer_non_unique(
+ self, target: AnyArrayLike
+ ) -> Tuple[np.ndarray, np.ndarray]:
target = ensure_index(target)
# check that target IntervalIndex is compatible
if isinstance(target, IntervalIndex):
- common_subtype = find_common_type([
- self.dtype.subtype, target.dtype.subtype])
+ common_subtype = find_common_type(
+ [self.dtype.subtype, target.dtype.subtype]
+ )
if self.closed != target.closed or is_object_dtype(common_subtype):
# different closed or incompatible subtype -> no matches
return np.repeat(-1, len(target)), np.arange(len(target))
@@ -909,8 +970,7 @@ def get_indexer_non_unique(self,
try:
locs = self.get_loc(key)
if isinstance(locs, slice):
- locs = np.arange(
- locs.start, locs.stop, locs.step, dtype='intp')
+ locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp")
locs = np.array(locs, ndmin=1)
except KeyError:
missing.append(i)
@@ -919,15 +979,11 @@ def get_indexer_non_unique(self,
indexer = np.concatenate(indexer)
else:
target = self._maybe_convert_i8(target)
- indexer, missing = self._engine.get_indexer_non_unique(
- target.values)
+ indexer, missing = self._engine.get_indexer_non_unique(target.values)
return ensure_platform_int(indexer), ensure_platform_int(missing)
- def get_indexer_for(self,
- target: AnyArrayLike,
- **kwargs
- ) -> np.ndarray:
+ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray:
"""
Guaranteed return of an indexer even when overlapping.
@@ -943,11 +999,8 @@ def get_indexer_for(self,
return self.get_indexer_non_unique(target, **kwargs)[0]
return self.get_indexer(target, **kwargs)
- @Appender(_index_shared_docs['get_value'] % _index_doc_kwargs)
- def get_value(self,
- series: ABCSeries,
- key: Any
- ) -> Any:
+ @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs)
+ def get_value(self, series: ABCSeries, key: Any) -> Any:
if com.is_bool_indexer(key):
loc = key
@@ -961,12 +1014,12 @@ def get_value(self,
elif isinstance(key, slice):
if not (key.step is None or key.step == 1):
raise ValueError("cannot support not-default step in a slice")
- loc = self._convert_slice_indexer(key, kind='getitem')
+ loc = self._convert_slice_indexer(key, kind="getitem")
else:
loc = self.get_loc(key)
return series.iloc[loc]
- @Appender(_index_shared_docs['where'])
+ @Appender(_index_shared_docs["where"])
def where(self, cond, other=None):
if other is None:
other = self._na_value
@@ -1002,16 +1055,18 @@ def insert(self, loc, item):
"""
if isinstance(item, Interval):
if item.closed != self.closed:
- raise ValueError('inserted item must be closed on the same '
- 'side as the index')
+ raise ValueError(
+ "inserted item must be closed on the same " "side as the index"
+ )
left_insert = item.left
right_insert = item.right
elif is_scalar(item) and isna(item):
# GH 18295
left_insert = right_insert = item
else:
- raise ValueError('can only insert Interval objects and NA into '
- 'an IntervalIndex')
+ raise ValueError(
+ "can only insert Interval objects and NA into " "an IntervalIndex"
+ )
new_left = self.left.insert(loc, left_insert)
new_right = self.right.insert(loc, right_insert)
@@ -1023,16 +1078,18 @@ def _concat_same_dtype(self, to_concat, name):
we allow a 0-len index here as well
"""
if not len({i.closed for i in to_concat if len(i)}) == 1:
- msg = ('can only append two IntervalIndex objects '
- 'that are closed on the same side')
+ msg = (
+ "can only append two IntervalIndex objects "
+ "that are closed on the same side"
+ )
raise ValueError(msg)
return super()._concat_same_dtype(to_concat, name)
- @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
- def take(self, indices, axis=0, allow_fill=True,
- fill_value=None, **kwargs):
- result = self._data.take(indices, axis=axis, allow_fill=allow_fill,
- fill_value=fill_value, **kwargs)
+ @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
+ result = self._data.take(
+ indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs
+ )
attributes = self._get_attributes_dict()
return self._simple_new(result, **attributes)
@@ -1051,56 +1108,56 @@ def __getitem__(self, value):
def _format_with_header(self, header, **kwargs):
return header + list(self._format_native_types(**kwargs))
- def _format_native_types(self, na_rep='NaN', quoting=None, **kwargs):
+ def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs):
""" actually format my specific types """
from pandas.io.formats.format import ExtensionArrayFormatter
- return ExtensionArrayFormatter(values=self,
- na_rep=na_rep,
- justify='all',
- leading_space=False).get_result()
+
+ return ExtensionArrayFormatter(
+ values=self, na_rep=na_rep, justify="all", leading_space=False
+ ).get_result()
def _format_data(self, name=None):
# TODO: integrate with categorical and make generic
# name argument is unused here; just for compat with base / categorical
n = len(self)
- max_seq_items = min((get_option(
- 'display.max_seq_items') or n) // 10, 10)
+ max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10)
formatter = str
if n == 0:
- summary = '[]'
+ summary = "[]"
elif n == 1:
first = formatter(self[0])
- summary = '[{first}]'.format(first=first)
+ summary = "[{first}]".format(first=first)
elif n == 2:
first = formatter(self[0])
last = formatter(self[-1])
- summary = '[{first}, {last}]'.format(first=first, last=last)
+ summary = "[{first}, {last}]".format(first=first, last=last)
else:
if n > max_seq_items:
n = min(max_seq_items // 2, 10)
head = [formatter(x) for x in self[:n]]
tail = [formatter(x) for x in self[-n:]]
- summary = '[{head} ... {tail}]'.format(
- head=', '.join(head), tail=', '.join(tail))
+ summary = "[{head} ... {tail}]".format(
+ head=", ".join(head), tail=", ".join(tail)
+ )
else:
tail = [formatter(x) for x in self]
- summary = '[{tail}]'.format(tail=', '.join(tail))
+ summary = "[{tail}]".format(tail=", ".join(tail))
- return summary + ',' + self._format_space()
+ return summary + "," + self._format_space()
def _format_attrs(self):
- attrs = [('closed', repr(self.closed))]
+ attrs = [("closed", repr(self.closed))]
if self.name is not None:
- attrs.append(('name', default_pprint(self.name)))
- attrs.append(('dtype', "'{dtype}'".format(dtype=self.dtype)))
+ attrs.append(("name", default_pprint(self.name)))
+ attrs.append(("dtype", "'{dtype}'".format(dtype=self.dtype)))
return attrs
def _format_space(self):
- space = ' ' * (len(self.__class__.__name__) + 1)
+ space = " " * (len(self.__class__.__name__) + 1)
return "\n{space}".format(space=space)
# --------------------------------------------------------------------
@@ -1120,30 +1177,30 @@ def equals(self, other):
if not isinstance(other, IntervalIndex):
if not is_interval_dtype(other):
return False
- other = Index(getattr(other, '.values', other))
+ other = Index(getattr(other, ".values", other))
- return (self.left.equals(other.left) and
- self.right.equals(other.right) and
- self.closed == other.closed)
+ return (
+ self.left.equals(other.left)
+ and self.right.equals(other.right)
+ and self.closed == other.closed
+ )
- @Appender(_interval_shared_docs['contains'] % _index_doc_kwargs)
+ @Appender(_interval_shared_docs["contains"] % _index_doc_kwargs)
def contains(self, other):
return self._data.contains(other)
- @Appender(_interval_shared_docs['overlaps'] % _index_doc_kwargs)
+ @Appender(_interval_shared_docs["overlaps"] % _index_doc_kwargs)
def overlaps(self, other):
return self._data.overlaps(other)
- @Appender(_index_shared_docs['intersection'])
- @SetopCheck(op_name='intersection')
- def intersection(self,
- other: 'IntervalIndex',
- sort: bool = False
- ) -> 'IntervalIndex':
+ @Appender(_index_shared_docs["intersection"])
+ @SetopCheck(op_name="intersection")
+ def intersection(
+ self, other: "IntervalIndex", sort: bool = False
+ ) -> "IntervalIndex":
if self.left.is_unique and self.right.is_unique:
taken = self._intersection_unique(other)
- elif (other.left.is_unique and other.right.is_unique and
- self.isna().sum() <= 1):
+ elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1:
# Swap other/self if other is unique and self does not have
# multiple NaNs
taken = other._intersection_unique(self)
@@ -1156,9 +1213,7 @@ def intersection(self,
return taken
- def _intersection_unique(self,
- other: 'IntervalIndex'
- ) -> 'IntervalIndex':
+ def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex":
"""
Used when the IntervalIndex does not have any common endpoint,
no matter left or right.
@@ -1180,9 +1235,7 @@ def _intersection_unique(self,
return self.take(indexer)
- def _intersection_non_unique(self,
- other: 'IntervalIndex'
- ) -> 'IntervalIndex':
+ def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex":
"""
Used when the IntervalIndex does have some common endpoints,
on either side.
@@ -1218,8 +1271,7 @@ def _intersection_non_unique(self,
def _setop(op_name, sort=None):
@SetopCheck(op_name=op_name)
def func(self, other, sort=sort):
- result = getattr(self._multiindex, op_name)(other._multiindex,
- sort=sort)
+ result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort)
result_name = get_op_result_name(self, other)
# GH 19101: ensure empty results have correct dtype
@@ -1228,8 +1280,7 @@ def func(self, other, sort=sort):
else:
result = result.values
- return type(self).from_tuples(result, closed=self.closed,
- name=result_name)
+ return type(self).from_tuples(result, closed=self.closed, name=result_name)
return func
@@ -1241,9 +1292,9 @@ def is_all_dates(self):
"""
return False
- union = _setop('union')
- difference = _setop('difference')
- symmetric_difference = _setop('symmetric_difference')
+ union = _setop("union")
+ difference = _setop("difference")
+ symmetric_difference = _setop("symmetric_difference")
# TODO: arithmetic operations
@@ -1253,24 +1304,31 @@ def is_all_dates(self):
def _is_valid_endpoint(endpoint):
"""helper for interval_range to check if start/end are valid types"""
- return any([is_number(endpoint),
- isinstance(endpoint, Timestamp),
- isinstance(endpoint, Timedelta),
- endpoint is None])
+ return any(
+ [
+ is_number(endpoint),
+ isinstance(endpoint, Timestamp),
+ isinstance(endpoint, Timedelta),
+ endpoint is None,
+ ]
+ )
def _is_type_compatible(a, b):
"""helper for interval_range to check type compat of start/end/freq"""
is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset))
is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset))
- return ((is_number(a) and is_number(b)) or
- (is_ts_compat(a) and is_ts_compat(b)) or
- (is_td_compat(a) and is_td_compat(b)) or
- com._any_none(a, b))
+ return (
+ (is_number(a) and is_number(b))
+ or (is_ts_compat(a) and is_ts_compat(b))
+ or (is_td_compat(a) and is_td_compat(b))
+ or com._any_none(a, b)
+ )
-def interval_range(start=None, end=None, periods=None, freq=None,
- name=None, closed='right'):
+def interval_range(
+ start=None, end=None, periods=None, freq=None, name=None, closed="right"
+):
"""
Return a fixed frequency IntervalIndex
@@ -1363,36 +1421,44 @@ def interval_range(start=None, end=None, periods=None, freq=None,
endpoint = start if start is not None else end
if freq is None and com._any_none(periods, start, end):
- freq = 1 if is_number(endpoint) else 'D'
+ freq = 1 if is_number(endpoint) else "D"
if com.count_not_none(start, end, periods, freq) != 3:
- raise ValueError('Of the four parameters: start, end, periods, and '
- 'freq, exactly three must be specified')
+ raise ValueError(
+ "Of the four parameters: start, end, periods, and "
+ "freq, exactly three must be specified"
+ )
if not _is_valid_endpoint(start):
- msg = 'start must be numeric or datetime-like, got {start}'
+ msg = "start must be numeric or datetime-like, got {start}"
raise ValueError(msg.format(start=start))
elif not _is_valid_endpoint(end):
- msg = 'end must be numeric or datetime-like, got {end}'
+ msg = "end must be numeric or datetime-like, got {end}"
raise ValueError(msg.format(end=end))
if is_float(periods):
periods = int(periods)
elif not is_integer(periods) and periods is not None:
- msg = 'periods must be a number, got {periods}'
+ msg = "periods must be a number, got {periods}"
raise TypeError(msg.format(periods=periods))
if freq is not None and not is_number(freq):
try:
freq = to_offset(freq)
except ValueError:
- raise ValueError('freq must be numeric or convertible to '
- 'DateOffset, got {freq}'.format(freq=freq))
+ raise ValueError(
+ "freq must be numeric or convertible to "
+ "DateOffset, got {freq}".format(freq=freq)
+ )
# verify type compatibility
- if not all([_is_type_compatible(start, end),
- _is_type_compatible(start, freq),
- _is_type_compatible(end, freq)]):
+ if not all(
+ [
+ _is_type_compatible(start, end),
+ _is_type_compatible(start, freq),
+ _is_type_compatible(end, freq),
+ ]
+ ):
raise TypeError("start, end, freq need to be type compatible")
# +1 to convert interval count to breaks count (n breaks = n-1 intervals)
@@ -1415,7 +1481,7 @@ def interval_range(start=None, end=None, periods=None, freq=None,
breaks = np.linspace(start, end, periods)
if all(is_integer(x) for x in com._not_none(start, end, freq)):
# np.linspace always produces float output
- breaks = maybe_downcast_to_dtype(breaks, 'int64')
+ breaks = maybe_downcast_to_dtype(breaks, "int64")
else:
# delegate to the appropriate range function
if isinstance(endpoint, Timestamp):
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 0823a3ed9ad59..71b551adaf3ef 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -7,16 +7,23 @@
from pandas._config import get_option
-from pandas._libs import (
- Timestamp, algos as libalgos, index as libindex, lib, tslibs)
+from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs
from pandas.compat.numpy import function as nv
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.util._decorators import Appender, cache_readonly, deprecate_kwarg
from pandas.core.dtypes.common import (
- ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable,
- is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar,
- pandas_dtype)
+ ensure_int64,
+ ensure_platform_int,
+ is_categorical_dtype,
+ is_hashable,
+ is_integer,
+ is_iterator,
+ is_list_like,
+ is_object_dtype,
+ is_scalar,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCDataFrame
from pandas.core.dtypes.missing import array_equivalent, isna
@@ -25,25 +32,32 @@
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
- Index, InvalidIndexError, _index_shared_docs, ensure_index)
+ Index,
+ InvalidIndexError,
+ _index_shared_docs,
+ ensure_index,
+)
from pandas.core.indexes.frozen import FrozenList, _ensure_frozen
import pandas.core.missing as missing
from pandas.io.formats.printing import (
- format_object_attrs, format_object_summary, pprint_thing)
+ format_object_attrs,
+ format_object_summary,
+ pprint_thing,
+)
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(
- dict(klass='MultiIndex',
- target_klass='MultiIndex or list of tuples'))
+ dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples")
+)
-class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine,
- libindex.UInt64Engine):
+class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
"""
This class manages a MultiIndex by mapping label combinations to positive
integers.
"""
+
_base = libindex.UInt64Engine
def _codes_to_ints(self, codes):
@@ -77,13 +91,13 @@ def _codes_to_ints(self, codes):
return np.bitwise_or.reduce(codes, axis=1)
-class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine,
- libindex.ObjectEngine):
+class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
"""
This class manages those (extreme) cases in which the number of possible
label combinations overflows the 64 bits integers, and uses an ObjectEngine
containing Python integers.
"""
+
_base = libindex.ObjectEngine
def _codes_to_ints(self, codes):
@@ -106,7 +120,7 @@ def _codes_to_ints(self, codes):
# Shift the representation of each level by the pre-calculated number
# of bits. Since this can overflow uint64, first make sure we are
# working with Python integers:
- codes = codes.astype('object') << self.offsets
+ codes = codes.astype("object") << self.offsets
# Now sum and OR are in fact interchangeable. This is a simple
# composition of the (disjunct) significant bits of each level (i.e.
@@ -205,20 +219,29 @@ class MultiIndex(Index):
"""
# initialize to zero-length tuples to make everything work
- _typ = 'multiindex'
+ _typ = "multiindex"
_names = FrozenList()
_levels = FrozenList()
_codes = FrozenList()
- _comparables = ['names']
+ _comparables = ["names"]
rename = Index.set_names
# --------------------------------------------------------------------
# Constructors
- @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
- def __new__(cls, levels=None, codes=None, sortorder=None, names=None,
- dtype=None, copy=False, name=None,
- verify_integrity=True, _set_identity=True):
+ @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes")
+ def __new__(
+ cls,
+ levels=None,
+ codes=None,
+ sortorder=None,
+ names=None,
+ dtype=None,
+ copy=False,
+ name=None,
+ verify_integrity=True,
+ _set_identity=True,
+ ):
# compat with Index
if name is not None:
@@ -226,9 +249,9 @@ def __new__(cls, levels=None, codes=None, sortorder=None, names=None,
if levels is None or codes is None:
raise TypeError("Must pass both levels and codes")
if len(levels) != len(codes):
- raise ValueError('Length of levels and codes must be the same.')
+ raise ValueError("Length of levels and codes must be the same.")
if len(levels) == 0:
- raise ValueError('Must pass non-zero number of levels/codes')
+ raise ValueError("Must pass non-zero number of levels/codes")
result = object.__new__(MultiIndex)
@@ -302,32 +325,39 @@ def _verify_integrity(self, codes=None, levels=None):
levels = levels or self.levels
if len(levels) != len(codes):
- raise ValueError("Length of levels and codes must match. NOTE:"
- " this index is in an inconsistent state.")
+ raise ValueError(
+ "Length of levels and codes must match. NOTE:"
+ " this index is in an inconsistent state."
+ )
codes_length = len(codes[0])
for i, (level, level_codes) in enumerate(zip(levels, codes)):
if len(level_codes) != codes_length:
- raise ValueError("Unequal code lengths: %s" %
- ([len(code_) for code_ in codes]))
+ raise ValueError(
+ "Unequal code lengths: %s" % ([len(code_) for code_ in codes])
+ )
if len(level_codes) and level_codes.max() >= len(level):
- msg = ("On level {level}, code max ({max_code}) >= length of "
- "level ({level_len}). NOTE: this index is in an "
- "inconsistent state".format(
- level=i, max_code=level_codes.max(),
- level_len=len(level)))
+ msg = (
+ "On level {level}, code max ({max_code}) >= length of "
+ "level ({level_len}). NOTE: this index is in an "
+ "inconsistent state".format(
+ level=i, max_code=level_codes.max(), level_len=len(level)
+ )
+ )
raise ValueError(msg)
if len(level_codes) and level_codes.min() < -1:
- raise ValueError("On level {level}, code value ({code})"
- " < -1".format(
- level=i, code=level_codes.min()))
+ raise ValueError(
+ "On level {level}, code value ({code})"
+ " < -1".format(level=i, code=level_codes.min())
+ )
if not level.is_unique:
- raise ValueError("Level values must be unique: {values} on "
- "level {level}".format(
- values=[value for value in level],
- level=i))
-
- codes = [self._validate_codes(level, code)
- for level, code in zip(levels, codes)]
+ raise ValueError(
+ "Level values must be unique: {values} on "
+ "level {level}".format(values=[value for value in level], level=i)
+ )
+
+ codes = [
+ self._validate_codes(level, code) for level, code in zip(levels, codes)
+ ]
new_codes = FrozenList(codes)
return new_codes
@@ -383,7 +413,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
# raise ValueError, if not
for i in range(1, len(arrays)):
if len(arrays[i]) != len(arrays[i - 1]):
- raise ValueError('all arrays must be same length')
+ raise ValueError("all arrays must be same length")
from pandas.core.arrays.categorical import _factorize_from_iterables
@@ -391,8 +421,13 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
if names is None:
names = [getattr(arr, "name", None) for arr in arrays]
- return MultiIndex(levels=levels, codes=codes, sortorder=sortorder,
- names=names, verify_integrity=False)
+ return MultiIndex(
+ levels=levels,
+ codes=codes,
+ sortorder=sortorder,
+ names=names,
+ verify_integrity=False,
+ )
@classmethod
def from_tuples(cls, tuples, sortorder=None, names=None):
@@ -432,13 +467,13 @@ def from_tuples(cls, tuples, sortorder=None, names=None):
names=['number', 'color'])
"""
if not is_list_like(tuples):
- raise TypeError('Input must be a list / sequence of tuple-likes.')
+ raise TypeError("Input must be a list / sequence of tuple-likes.")
elif is_iterator(tuples):
tuples = list(tuples)
if len(tuples) == 0:
if names is None:
- msg = 'Cannot infer number of levels from empty list'
+ msg = "Cannot infer number of levels from empty list"
raise TypeError(msg)
arrays = [[]] * len(names)
elif isinstance(tuples, (np.ndarray, Index)):
@@ -591,8 +626,10 @@ def array(self):
------
ValueError
"""
- msg = ("MultiIndex has no single backing array. Use "
- "'MultiIndex.to_numpy()' to get a NumPy array of tuples.")
+ msg = (
+ "MultiIndex has no single backing array. Use "
+ "'MultiIndex.to_numpy()' to get a NumPy array of tuples."
+ )
raise ValueError(msg)
@property
@@ -617,22 +654,23 @@ def _is_homogeneous_type(self):
"""
return len({x.dtype for x in self.levels}) <= 1
- def _set_levels(self, levels, level=None, copy=False, validate=True,
- verify_integrity=False):
+ def _set_levels(
+ self, levels, level=None, copy=False, validate=True, verify_integrity=False
+ ):
# This is NOT part of the levels property because it should be
# externally not allowed to set levels. User beware if you change
# _levels directly
if validate and len(levels) == 0:
- raise ValueError('Must set non-zero number of levels.')
+ raise ValueError("Must set non-zero number of levels.")
if validate and level is None and len(levels) != self.nlevels:
- raise ValueError('Length of levels must match number of levels.')
+ raise ValueError("Length of levels must match number of levels.")
if validate and level is not None and len(levels) != len(level):
- raise ValueError('Length of levels must match length of level.')
+ raise ValueError("Length of levels must match length of level.")
if level is None:
new_levels = FrozenList(
- ensure_index(lev, copy=copy)._shallow_copy()
- for lev in levels)
+ ensure_index(lev, copy=copy)._shallow_copy() for lev in levels
+ )
else:
level = [self._get_level_number(l) for l in level]
new_levels = list(self._levels)
@@ -652,8 +690,7 @@ def _set_levels(self, levels, level=None, copy=False, validate=True,
self._tuples = None
self._reset_cache()
- def set_levels(self, levels, level=None, inplace=False,
- verify_integrity=True):
+ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True):
"""
Set new levels on MultiIndex. Defaults to returning
new index.
@@ -722,8 +759,9 @@ def set_levels(self, levels, level=None, inplace=False,
else:
idx = self._shallow_copy()
idx._reset_identity()
- idx._set_levels(levels, level=level, validate=True,
- verify_integrity=verify_integrity)
+ idx._set_levels(
+ levels, level=level, validate=True, verify_integrity=verify_integrity
+ )
if not inplace:
return idx
@@ -733,29 +771,34 @@ def codes(self):
@property
def labels(self):
- warnings.warn((".labels was deprecated in version 0.24.0. "
- "Use .codes instead."),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ (".labels was deprecated in version 0.24.0. " "Use .codes instead."),
+ FutureWarning,
+ stacklevel=2,
+ )
return self.codes
- def _set_codes(self, codes, level=None, copy=False, validate=True,
- verify_integrity=False):
+ def _set_codes(
+ self, codes, level=None, copy=False, validate=True, verify_integrity=False
+ ):
if validate and level is None and len(codes) != self.nlevels:
raise ValueError("Length of codes must match number of levels")
if validate and level is not None and len(codes) != len(level):
- raise ValueError('Length of codes must match length of levels.')
+ raise ValueError("Length of codes must match length of levels.")
if level is None:
new_codes = FrozenList(
_ensure_frozen(level_codes, lev, copy=copy)._shallow_copy()
- for lev, level_codes in zip(self.levels, codes))
+ for lev, level_codes in zip(self.levels, codes)
+ )
else:
level = [self._get_level_number(l) for l in level]
new_codes = list(self._codes)
for lev_idx, level_codes in zip(level, codes):
lev = self.levels[lev_idx]
new_codes[lev_idx] = _ensure_frozen(
- level_codes, lev, copy=copy)._shallow_copy()
+ level_codes, lev, copy=copy
+ )._shallow_copy()
new_codes = FrozenList(new_codes)
if verify_integrity:
@@ -766,17 +809,24 @@ def _set_codes(self, codes, level=None, copy=False, validate=True,
self._tuples = None
self._reset_cache()
- def set_labels(self, labels, level=None, inplace=False,
- verify_integrity=True):
- warnings.warn((".set_labels was deprecated in version 0.24.0. "
- "Use .set_codes instead."),
- FutureWarning, stacklevel=2)
- return self.set_codes(codes=labels, level=level, inplace=inplace,
- verify_integrity=verify_integrity)
+ def set_labels(self, labels, level=None, inplace=False, verify_integrity=True):
+ warnings.warn(
+ (
+ ".set_labels was deprecated in version 0.24.0. "
+ "Use .set_codes instead."
+ ),
+ FutureWarning,
+ stacklevel=2,
+ )
+ return self.set_codes(
+ codes=labels,
+ level=level,
+ inplace=inplace,
+ verify_integrity=verify_integrity,
+ )
- @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
- def set_codes(self, codes, level=None, inplace=False,
- verify_integrity=True):
+ @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes")
+ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True):
"""
Set new codes on MultiIndex. Defaults to returning
new index.
@@ -852,9 +902,17 @@ def set_codes(self, codes, level=None, inplace=False,
if not inplace:
return idx
- @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
- def copy(self, names=None, dtype=None, levels=None, codes=None,
- deep=False, _set_identity=False, **kwargs):
+ @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes")
+ def copy(
+ self,
+ names=None,
+ dtype=None,
+ levels=None,
+ codes=None,
+ deep=False,
+ _set_identity=False,
+ **kwargs
+ ):
"""
Make a copy of this object. Names, dtype, levels and codes can be
passed and will be set on new copy.
@@ -876,11 +934,12 @@ def copy(self, names=None, dtype=None, levels=None, codes=None,
``deep``, but if ``deep`` is passed it will attempt to deepcopy.
This could be potentially expensive on large MultiIndex objects.
"""
- name = kwargs.get('name')
+ name = kwargs.get("name")
names = self._validate_names(name=name, names=names, deep=deep)
if deep:
from copy import deepcopy
+
if levels is None:
levels = deepcopy(self.levels)
if codes is None:
@@ -890,9 +949,14 @@ def copy(self, names=None, dtype=None, levels=None, codes=None,
levels = self.levels
if codes is None:
codes = self.codes
- return MultiIndex(levels=levels, codes=codes, names=names,
- sortorder=self.sortorder, verify_integrity=False,
- _set_identity=_set_identity)
+ return MultiIndex(
+ levels=levels,
+ codes=codes,
+ names=names,
+ sortorder=self.sortorder,
+ verify_integrity=False,
+ _set_identity=_set_identity,
+ )
def __array__(self, dtype=None):
""" the array interface, return my values """
@@ -908,12 +972,14 @@ def _shallow_copy_with_infer(self, values, **kwargs):
# On equal MultiIndexes the difference is empty.
# Therefore, an empty MultiIndex is returned GH13490
if len(values) == 0:
- return MultiIndex(levels=[[] for _ in range(self.nlevels)],
- codes=[[] for _ in range(self.nlevels)],
- **kwargs)
+ return MultiIndex(
+ levels=[[] for _ in range(self.nlevels)],
+ codes=[[] for _ in range(self.nlevels)],
+ **kwargs
+ )
return self._shallow_copy(values, **kwargs)
- @Appender(_index_shared_docs['contains'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs)
def __contains__(self, key):
hash(key)
try:
@@ -922,23 +988,25 @@ def __contains__(self, key):
except (LookupError, TypeError, ValueError):
return False
- @Appender(_index_shared_docs['_shallow_copy'])
+ @Appender(_index_shared_docs["_shallow_copy"])
def _shallow_copy(self, values=None, **kwargs):
if values is not None:
- names = kwargs.pop('names', kwargs.pop('name', self.names))
+ names = kwargs.pop("names", kwargs.pop("name", self.names))
# discards freq
- kwargs.pop('freq', None)
+ kwargs.pop("freq", None)
return MultiIndex.from_tuples(values, names=names, **kwargs)
return self.copy(**kwargs)
@cache_readonly
def dtype(self):
- return np.dtype('O')
+ return np.dtype("O")
def _is_memory_usage_qualified(self):
""" return a boolean if we need a qualified .info display """
+
def f(l):
- return 'mixed' in l or 'string' in l or 'unicode' in l
+ return "mixed" in l or "string" in l or "unicode" in l
+
return any(f(l) for l in self._inferred_type_levels)
@Appender(Index.memory_usage.__doc__)
@@ -989,8 +1057,9 @@ def _format_data(self, name=None):
"""
Return the formatted data as a unicode string
"""
- return format_object_summary(self, self._formatter_func,
- name=name, line_break_each_value=True)
+ return format_object_summary(
+ self, self._formatter_func, name=name, line_break_each_value=True
+ )
def _format_attrs(self):
"""
@@ -998,7 +1067,7 @@ def _format_attrs(self):
"""
return format_object_attrs(self, include_dtype=False)
- def _format_native_types(self, na_rep='nan', **kwargs):
+ def _format_native_types(self, na_rep="nan", **kwargs):
new_levels = []
new_codes = []
@@ -1006,7 +1075,7 @@ def _format_native_types(self, na_rep='nan', **kwargs):
for level, level_codes in zip(self.levels, self.codes):
level = level._format_native_types(na_rep=na_rep, **kwargs)
# add nan values, if there are any
- mask = (level_codes == -1)
+ mask = level_codes == -1
if mask.any():
nan_index = len(level)
level = np.append(level, na_rep)
@@ -1017,17 +1086,27 @@ def _format_native_types(self, na_rep='nan', **kwargs):
if len(new_levels) == 1:
# a single-level multi-index
- return Index(new_levels[0].take(
- new_codes[0]))._format_native_types()
+ return Index(new_levels[0].take(new_codes[0]))._format_native_types()
else:
# reconstruct the multi-index
- mi = MultiIndex(levels=new_levels, codes=new_codes,
- names=self.names, sortorder=self.sortorder,
- verify_integrity=False)
+ mi = MultiIndex(
+ levels=new_levels,
+ codes=new_codes,
+ names=self.names,
+ sortorder=self.sortorder,
+ verify_integrity=False,
+ )
return mi.values
- def format(self, space=2, sparsify=None, adjoin=True, names=False,
- na_rep=None, formatter=None):
+ def format(
+ self,
+ space=2,
+ sparsify=None,
+ adjoin=True,
+ names=False,
+ na_rep=None,
+ formatter=None,
+ ):
if len(self) == 0:
return []
@@ -1048,9 +1127,10 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
else:
# weird all NA case
- formatted = [pprint_thing(na if isna(x) else x,
- escape_chars=('\t', '\r', '\n'))
- for x in algos.take_1d(lev._values, level_codes)]
+ formatted = [
+ pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n"))
+ for x in algos.take_1d(lev._values, level_codes)
+ ]
stringified_levels.append(formatted)
result_levels = []
@@ -1058,9 +1138,11 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
level = []
if names:
- level.append(pprint_thing(name,
- escape_chars=('\t', '\r', '\n'))
- if name is not None else '')
+ level.append(
+ pprint_thing(name, escape_chars=("\t", "\r", "\n"))
+ if name is not None
+ else ""
+ )
level.extend(np.array(lev, dtype=object))
result_levels.append(level)
@@ -1069,20 +1151,22 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
sparsify = get_option("display.multi_sparse")
if sparsify:
- sentinel = ''
+ sentinel = ""
# GH3547
# use value of sparsify as sentinel, unless it's an obvious
# "Truthy" value
if sparsify not in [True, 1]:
sentinel = sparsify
# little bit of a kludge job for #1217
- result_levels = _sparsify(result_levels, start=int(names),
- sentinel=sentinel)
+ result_levels = _sparsify(
+ result_levels, start=int(names), sentinel=sentinel
+ )
if adjoin:
from pandas.io.formats.format import _get_adjustment
+
adj = _get_adjustment()
- return adj.adjoin(space, *result_levels).split('\n')
+ return adj.adjoin(space, *result_levels).split("\n")
else:
return result_levels
@@ -1122,14 +1206,15 @@ def _set_names(self, names, level=None, validate=True):
# GH 15110
# Don't allow a single string for names in a MultiIndex
if names is not None and not is_list_like(names):
- raise ValueError('Names should be list-like for a MultiIndex')
+ raise ValueError("Names should be list-like for a MultiIndex")
names = list(names)
if validate and level is not None and len(names) != len(level):
- raise ValueError('Length of names must match length of level.')
+ raise ValueError("Length of names must match length of level.")
if validate and level is None and len(names) != self.nlevels:
- raise ValueError('Length of names must match number of levels in '
- 'MultiIndex.')
+ raise ValueError(
+ "Length of names must match number of levels in " "MultiIndex."
+ )
if level is None:
level = range(self.nlevels)
@@ -1142,14 +1227,18 @@ def _set_names(self, names, level=None, validate=True):
# GH 20527
# All items in 'names' need to be hashable:
if not is_hashable(name):
- raise TypeError('{}.name must be a hashable type'
- .format(self.__class__.__name__))
+ raise TypeError(
+ "{}.name must be a hashable type".format(
+ self.__class__.__name__
+ )
+ )
self.levels[l].rename(name, inplace=True)
- names = property(fset=_set_names, fget=_get_names,
- doc="""\nNames of levels in MultiIndex\n""")
+ names = property(
+ fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n"""
+ )
- @Appender(_index_shared_docs['_get_grouper_for_level'])
+ @Appender(_index_shared_docs["_get_grouper_for_level"])
def _get_grouper_for_level(self, mapper, level):
indexer = self.codes[level]
level_index = self.levels[level]
@@ -1185,29 +1274,34 @@ def _constructor(self):
@cache_readonly
def inferred_type(self):
- return 'mixed'
+ return "mixed"
def _get_level_number(self, level):
count = self.names.count(level)
if (count > 1) and not is_integer(level):
- raise ValueError('The name %s occurs multiple times, use a '
- 'level number' % level)
+ raise ValueError(
+ "The name %s occurs multiple times, use a " "level number" % level
+ )
try:
level = self.names.index(level)
except ValueError:
if not is_integer(level):
- raise KeyError('Level %s not found' % str(level))
+ raise KeyError("Level %s not found" % str(level))
elif level < 0:
level += self.nlevels
if level < 0:
orig_level = level - self.nlevels
- raise IndexError('Too many levels: Index has only %d '
- 'levels, %d is not a valid level number' %
- (self.nlevels, orig_level))
+ raise IndexError(
+ "Too many levels: Index has only %d "
+ "levels, %d is not a valid level number"
+ % (self.nlevels, orig_level)
+ )
# Note: levels are zero-based
elif level >= self.nlevels:
- raise IndexError('Too many levels: Index has only %d levels, '
- 'not %d' % (self.nlevels, level + 1))
+ raise IndexError(
+ "Too many levels: Index has only %d levels, "
+ "not %d" % (self.nlevels, level + 1)
+ )
return level
_tuples = None
@@ -1226,7 +1320,7 @@ def _engine(self):
# equivalent to sorting lexicographically the codes themselves. Notice
# that each level needs to be shifted by the number of bits needed to
# represent the _previous_ ones:
- offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint64')
+ offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")
# Check the total number of bits needed for our representation:
if lev_bits[0] > 64:
@@ -1245,8 +1339,7 @@ def values(self):
vals = self._get_level_values(i)
if is_categorical_dtype(vals):
vals = vals._internal_get_values()
- if (isinstance(vals.dtype, ExtensionDtype)
- or hasattr(vals, '_box_values')):
+ if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"):
vals = vals.astype(object)
vals = np.array(vals, copy=False)
values.append(vals)
@@ -1267,8 +1360,9 @@ def is_monotonic_increasing(self):
"""
# reversed() because lexsort() wants the most significant key last.
- values = [self._get_level_values(i).values
- for i in reversed(range(len(self.levels)))]
+ values = [
+ self._get_level_values(i).values for i in reversed(range(len(self.levels)))
+ ]
try:
sort_order = np.lexsort(values)
return Index(sort_order).is_monotonic
@@ -1289,7 +1383,7 @@ def is_monotonic_decreasing(self):
@cache_readonly
def _have_mixed_levels(self):
""" return a boolean list indicated if we have mixed levels """
- return ['mixed' in l for l in self._inferred_type_levels]
+ return ["mixed" in l for l in self._inferred_type_levels]
@cache_readonly
def _inferred_type_levels(self):
@@ -1300,6 +1394,7 @@ def _inferred_type_levels(self):
def _hashed_values(self):
""" return a uint64 ndarray of my hashed values """
from pandas.core.util.hashing import hash_tuples
+
return hash_tuples(self)
def _hashed_indexing_key(self, key):
@@ -1333,12 +1428,14 @@ def f(k, stringify):
if stringify and not isinstance(k, str):
k = str(k)
return k
- key = tuple(f(k, stringify)
- for k, stringify in zip(key, self._have_mixed_levels))
+
+ key = tuple(
+ f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels)
+ )
return hash_tuple(key)
@Appender(Index.duplicated.__doc__)
- def duplicated(self, keep='first'):
+ def duplicated(self, keep="first"):
from pandas.core.sorting import get_group_index
from pandas._libs.hashtable import duplicated_int64
@@ -1351,14 +1448,14 @@ def fillna(self, value=None, downcast=None):
"""
fillna is not implemented for MultiIndex
"""
- raise NotImplementedError('isna is not defined for MultiIndex')
+ raise NotImplementedError("isna is not defined for MultiIndex")
- @Appender(_index_shared_docs['dropna'])
- def dropna(self, how='any'):
+ @Appender(_index_shared_docs["dropna"])
+ def dropna(self, how="any"):
nans = [level_codes == -1 for level_codes in self.codes]
- if how == 'any':
+ if how == "any":
indexer = np.any(nans, axis=0)
- elif how == 'all':
+ elif how == "all":
indexer = np.all(nans, axis=0)
else:
raise ValueError("invalid how option: {0}".format(how))
@@ -1380,8 +1477,9 @@ def _try_mi(k):
new_values = series._values[loc]
new_index = self[loc]
new_index = maybe_droplevels(new_index, k)
- return series._constructor(new_values, index=new_index,
- name=series.name).__finalize__(self)
+ return series._constructor(
+ new_values, index=new_index, name=series.name
+ ).__finalize__(self)
try:
return self._engine.get_value(s, k)
@@ -1419,8 +1517,13 @@ def _try_mi(k):
try:
return _try_mi(Timestamp(key))
- except (KeyError, TypeError,
- IndexError, ValueError, tslibs.OutOfBoundsDatetime):
+ except (
+ KeyError,
+ TypeError,
+ IndexError,
+ ValueError,
+ tslibs.OutOfBoundsDatetime,
+ ):
pass
raise InvalidIndexError(key)
@@ -1447,8 +1550,7 @@ def _get_level_values(self, level, unique=False):
level_codes = self.codes[level]
if unique:
level_codes = algos.unique(level_codes)
- filled = algos.take_1d(values._values, level_codes,
- fill_value=values._na_value)
+ filled = algos.take_1d(values._values, level_codes, fill_value=values._na_value)
values = values._shallow_copy(filled)
return values
@@ -1488,7 +1590,7 @@ def get_level_values(self, level):
values = self._get_level_values(level)
return values
- @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs)
def unique(self, level=None):
if level is None:
@@ -1528,26 +1630,31 @@ def to_frame(self, index=True, name=None):
"""
from pandas import DataFrame
+
if name is not None:
if not is_list_like(name):
- raise TypeError("'name' must be a list / sequence "
- "of column names.")
+ raise TypeError("'name' must be a list / sequence " "of column names.")
if len(name) != len(self.levels):
- raise ValueError("'name' should have same length as "
- "number of levels on index.")
+ raise ValueError(
+ "'name' should have same length as " "number of levels on index."
+ )
idx_names = name
else:
idx_names = self.names
# Guarantee resulting column order
result = DataFrame(
- OrderedDict([
- ((level if lvlname is None else lvlname),
- self._get_level_values(level))
- for lvlname, level in zip(idx_names, range(len(self.levels)))
- ]),
- copy=False
+ OrderedDict(
+ [
+ (
+ (level if lvlname is None else lvlname),
+ self._get_level_values(level),
+ )
+ for lvlname, level in zip(idx_names, range(len(self.levels)))
+ ]
+ ),
+ copy=False,
)
if index:
@@ -1598,14 +1705,16 @@ def to_hierarchical(self, n_repeat, n_shuffle=1):
)
"""
levels = self.levels
- codes = [np.repeat(level_codes, n_repeat) for
- level_codes in self.codes]
+ codes = [np.repeat(level_codes, n_repeat) for level_codes in self.codes]
# Assumes that each level_codes is divisible by n_shuffle
- codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes]
+ codes = [x.reshape(n_shuffle, -1).ravel(order="F") for x in codes]
names = self.names
- warnings.warn("Method .to_hierarchical is deprecated and will "
- "be removed in a future version",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Method .to_hierarchical is deprecated and will "
+ "be removed in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
return MultiIndex(levels=levels, codes=codes, names=names)
def to_flat_index(self):
@@ -1728,9 +1837,13 @@ def _sort_levels_monotonic(self):
new_levels.append(lev)
new_codes.append(level_codes)
- return MultiIndex(new_levels, new_codes,
- names=self.names, sortorder=self.sortorder,
- verify_integrity=False)
+ return MultiIndex(
+ new_levels,
+ new_codes,
+ names=self.names,
+ sortorder=self.sortorder,
+ verify_integrity=False,
+ )
def remove_unused_levels(self):
"""
@@ -1835,19 +1948,22 @@ def levshape(self):
def __reduce__(self):
"""Necessary for making this object picklable"""
- d = dict(levels=[lev for lev in self.levels],
- codes=[level_codes for level_codes in self.codes],
- sortorder=self.sortorder, names=list(self.names))
+ d = dict(
+ levels=[lev for lev in self.levels],
+ codes=[level_codes for level_codes in self.codes],
+ sortorder=self.sortorder,
+ names=list(self.names),
+ )
return ibase._new_Index, (self.__class__, d), None
def __setstate__(self, state):
"""Necessary for making this object picklable"""
if isinstance(state, dict):
- levels = state.get('levels')
- codes = state.get('codes')
- sortorder = state.get('sortorder')
- names = state.get('names')
+ levels = state.get("levels")
+ codes = state.get("codes")
+ sortorder = state.get("sortorder")
+ names = state.get("names")
elif isinstance(state, tuple):
@@ -1887,30 +2003,40 @@ def __getitem__(self, key):
new_codes = [level_codes[key] for level_codes in self.codes]
- return MultiIndex(levels=self.levels, codes=new_codes,
- names=self.names, sortorder=sortorder,
- verify_integrity=False)
+ return MultiIndex(
+ levels=self.levels,
+ codes=new_codes,
+ names=self.names,
+ sortorder=sortorder,
+ verify_integrity=False,
+ )
- @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
- def take(self, indices, axis=0, allow_fill=True,
- fill_value=None, **kwargs):
+ @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
+ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):
nv.validate_take(tuple(), kwargs)
indices = ensure_platform_int(indices)
- taken = self._assert_take_fillable(self.codes, indices,
- allow_fill=allow_fill,
- fill_value=fill_value,
- na_value=-1)
- return MultiIndex(levels=self.levels, codes=taken,
- names=self.names, verify_integrity=False)
-
- def _assert_take_fillable(self, values, indices, allow_fill=True,
- fill_value=None, na_value=None):
+ taken = self._assert_take_fillable(
+ self.codes,
+ indices,
+ allow_fill=allow_fill,
+ fill_value=fill_value,
+ na_value=-1,
+ )
+ return MultiIndex(
+ levels=self.levels, codes=taken, names=self.names, verify_integrity=False
+ )
+
+ def _assert_take_fillable(
+ self, values, indices, allow_fill=True, fill_value=None, na_value=None
+ ):
""" Internal method to handle NA filling of take """
# only fill if we are passing a non-None fill_value
if allow_fill and fill_value is not None:
if (indices < -1).any():
- msg = ('When allow_fill=True and fill_value is not None, '
- 'all indices must be >= -1')
+ msg = (
+ "When allow_fill=True and fill_value is not None, "
+ "all indices must be >= -1"
+ )
raise ValueError(msg)
taken = [lab.take(indices) for lab in self.codes]
mask = indices == -1
@@ -1940,8 +2066,9 @@ def append(self, other):
if not isinstance(other, (list, tuple)):
other = [other]
- if all((isinstance(o, MultiIndex) and o.nlevels >= self.nlevels)
- for o in other):
+ if all(
+ (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
+ ):
arrays = []
for i in range(self.nlevels):
label = self._get_level_values(i)
@@ -1949,7 +2076,7 @@ def append(self, other):
arrays.append(label.append(appended))
return MultiIndex.from_arrays(arrays, names=self.names)
- to_concat = (self.values, ) + tuple(k._values for k in other)
+ to_concat = (self.values,) + tuple(k._values for k in other)
new_tuples = np.concatenate(to_concat)
# if all(isinstance(x, MultiIndex) for x in other):
@@ -1961,21 +2088,27 @@ def append(self, other):
def argsort(self, *args, **kwargs):
return self.values.argsort(*args, **kwargs)
- @Appender(_index_shared_docs['repeat'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
- return MultiIndex(levels=self.levels,
- codes=[level_codes.view(np.ndarray).repeat(repeats)
- for level_codes in self.codes],
- names=self.names, sortorder=self.sortorder,
- verify_integrity=False)
+ return MultiIndex(
+ levels=self.levels,
+ codes=[
+ level_codes.view(np.ndarray).repeat(repeats)
+ for level_codes in self.codes
+ ],
+ names=self.names,
+ sortorder=self.sortorder,
+ verify_integrity=False,
+ )
def where(self, cond, other=None):
- raise NotImplementedError(".where is not supported for "
- "MultiIndex operations")
+ raise NotImplementedError(
+ ".where is not supported for " "MultiIndex operations"
+ )
- @deprecate_kwarg(old_arg_name='labels', new_arg_name='codes')
- def drop(self, codes, level=None, errors='raise'):
+ @deprecate_kwarg(old_arg_name="labels", new_arg_name="codes")
+ def drop(self, codes, level=None, errors="raise"):
"""
Make new MultiIndex with passed list of codes deleted
@@ -1998,9 +2131,8 @@ def drop(self, codes, level=None, errors='raise'):
indexer = self.get_indexer(codes)
mask = indexer == -1
if mask.any():
- if errors != 'ignore':
- raise ValueError('codes %s not contained in axis' %
- codes[mask])
+ if errors != "ignore":
+ raise ValueError("codes %s not contained in axis" % codes[mask])
except Exception:
pass
@@ -2016,18 +2148,20 @@ def drop(self, codes, level=None, errors='raise'):
inds.extend(range(loc.start, loc.stop))
elif com.is_bool_indexer(loc):
if self.lexsort_depth == 0:
- warnings.warn('dropping on a non-lexsorted multi-index'
- ' without a level parameter may impact '
- 'performance.',
- PerformanceWarning,
- stacklevel=3)
+ warnings.warn(
+ "dropping on a non-lexsorted multi-index"
+ " without a level parameter may impact "
+ "performance.",
+ PerformanceWarning,
+ stacklevel=3,
+ )
loc = loc.nonzero()[0]
inds.extend(loc)
else:
- msg = 'unsupported indexer of type {}'.format(type(loc))
+ msg = "unsupported indexer of type {}".format(type(loc))
raise AssertionError(msg)
except KeyError:
- if errors != 'ignore':
+ if errors != "ignore":
raise
return self.delete(inds)
@@ -2101,8 +2235,9 @@ def swaplevel(self, i=-2, j=-1):
new_codes[i], new_codes[j] = new_codes[j], new_codes[i]
new_names[i], new_names[j] = new_names[j], new_names[i]
- return MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
+ return MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
def reorder_levels(self, order):
"""
@@ -2117,15 +2252,17 @@ def reorder_levels(self, order):
"""
order = [self._get_level_number(i) for i in order]
if len(order) != self.nlevels:
- raise AssertionError('Length of order must be same as '
- 'number of levels (%d), got %d' %
- (self.nlevels, len(order)))
+ raise AssertionError(
+ "Length of order must be same as "
+ "number of levels (%d), got %d" % (self.nlevels, len(order))
+ )
new_levels = [self.levels[i] for i in order]
new_codes = [self.codes[i] for i in order]
new_names = [self.names[i] for i in order]
- return MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
+ return MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
def __getslice__(self, i, j):
return self.__getitem__(slice(i, j))
@@ -2141,13 +2278,15 @@ def _get_codes_for_sorting(self):
from pandas.core.arrays import Categorical
def cats(level_codes):
- return np.arange(np.array(level_codes).max() + 1 if
- len(level_codes) else 0,
- dtype=level_codes.dtype)
+ return np.arange(
+ np.array(level_codes).max() + 1 if len(level_codes) else 0,
+ dtype=level_codes.dtype,
+ )
- return [Categorical.from_codes(level_codes, cats(level_codes),
- ordered=True)
- for level_codes in self.codes]
+ return [
+ Categorical.from_codes(level_codes, cats(level_codes), ordered=True)
+ for level_codes in self.codes
+ ]
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
"""
@@ -2184,8 +2323,10 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
raise ValueError("level must have same length as ascending")
from pandas.core.sorting import lexsort_indexer
- indexer = lexsort_indexer([self.codes[lev] for lev in level],
- orders=ascending)
+
+ indexer = lexsort_indexer(
+ [self.codes[lev] for lev in level], orders=ascending
+ )
# level ordering
else:
@@ -2209,8 +2350,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
else:
sortorder = level[0]
- indexer = indexer_from_factorized(primary, primshp,
- compress=False)
+ indexer = indexer_from_factorized(primary, primshp, compress=False)
if not ascending:
indexer = indexer[::-1]
@@ -2218,9 +2358,13 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
indexer = ensure_platform_int(indexer)
new_codes = [level_codes.take(indexer) for level_codes in self.codes]
- new_index = MultiIndex(codes=new_codes, levels=self.levels,
- names=self.names, sortorder=sortorder,
- verify_integrity=False)
+ new_index = MultiIndex(
+ codes=new_codes,
+ levels=self.levels,
+ names=self.names,
+ sortorder=sortorder,
+ verify_integrity=False,
+ )
return new_index, indexer
@@ -2240,8 +2384,7 @@ def _convert_listlike_indexer(self, keyarr, kind=None):
indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind)
# are we indexing a specific level
- if indexer is None and len(keyarr) and not isinstance(keyarr[0],
- tuple):
+ if indexer is None and len(keyarr) and not isinstance(keyarr[0], tuple):
level = 0
_, indexer = self.reindex(keyarr, level=level)
@@ -2252,11 +2395,11 @@ def _convert_listlike_indexer(self, keyarr, kind=None):
check = self.levels[0].get_indexer(keyarr)
mask = check == -1
if mask.any():
- raise KeyError('%s not in index' % keyarr[mask])
+ raise KeyError("%s not in index" % keyarr[mask])
return indexer, keyarr
- @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
method = missing.clean_reindex_fill_method(method)
target = ensure_index(target)
@@ -2272,34 +2415,36 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
# let's instead try with a straight Index
if method is None:
- return Index(self.values).get_indexer(target,
- method=method,
- limit=limit,
- tolerance=tolerance)
+ return Index(self.values).get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
if not self.is_unique:
- raise ValueError('Reindexing only valid with uniquely valued '
- 'Index objects')
+ raise ValueError(
+ "Reindexing only valid with uniquely valued " "Index objects"
+ )
- if method == 'pad' or method == 'backfill':
+ if method == "pad" or method == "backfill":
if tolerance is not None:
- raise NotImplementedError("tolerance not implemented yet "
- 'for MultiIndex')
+ raise NotImplementedError(
+ "tolerance not implemented yet " "for MultiIndex"
+ )
indexer = self._engine.get_indexer(target, method, limit)
- elif method == 'nearest':
- raise NotImplementedError("method='nearest' not implemented yet "
- 'for MultiIndex; see GitHub issue 9365')
+ elif method == "nearest":
+ raise NotImplementedError(
+ "method='nearest' not implemented yet "
+ "for MultiIndex; see GitHub issue 9365"
+ )
else:
indexer = self._engine.get_indexer(target)
return ensure_platform_int(indexer)
- @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
def get_indexer_non_unique(self, target):
return super().get_indexer_non_unique(target)
- def reindex(self, target, method=None, level=None, limit=None,
- tolerance=None):
+ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
"""
Create index with target's values (move/add/delete values as necessary)
@@ -2313,11 +2458,11 @@ def reindex(self, target, method=None, level=None, limit=None,
"""
# GH6552: preserve names when reindexing to non-named target
# (i.e. neither Index nor Series).
- preserve_names = not hasattr(target, 'names')
+ preserve_names = not hasattr(target, "names")
if level is not None:
if method is not None:
- raise TypeError('Fill method not supported if level passed')
+ raise TypeError("Fill method not supported if level passed")
# GH7774: preserve dtype/tz if target is empty and not an Index.
# target may be an iterator
@@ -2325,23 +2470,22 @@ def reindex(self, target, method=None, level=None, limit=None,
if len(target) == 0 and not isinstance(target, Index):
idx = self.levels[level]
attrs = idx._get_attributes_dict()
- attrs.pop('freq', None) # don't preserve freq
- target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype),
- **attrs)
+ attrs.pop("freq", None) # don't preserve freq
+ target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype), **attrs)
else:
target = ensure_index(target)
- target, indexer, _ = self._join_level(target, level, how='right',
- return_indexers=True,
- keep_order=False)
+ target, indexer, _ = self._join_level(
+ target, level, how="right", return_indexers=True, keep_order=False
+ )
else:
target = ensure_index(target)
if self.equals(target):
indexer = None
else:
if self.is_unique:
- indexer = self.get_indexer(target, method=method,
- limit=limit,
- tolerance=tolerance)
+ indexer = self.get_indexer(
+ target, method=method, limit=limit, tolerance=tolerance
+ )
else:
raise ValueError("cannot handle a non-unique multi-index!")
@@ -2354,8 +2498,11 @@ def reindex(self, target, method=None, level=None, limit=None,
# hopefully?
target = MultiIndex.from_tuples(target)
- if (preserve_names and target.nlevels == self.nlevels and
- target.names != self.names):
+ if (
+ preserve_names
+ and target.nlevels == self.nlevels
+ and target.names != self.names
+ ):
target = target.copy(deep=False)
target.names = self.names
@@ -2364,7 +2511,7 @@ def reindex(self, target, method=None, level=None, limit=None,
def get_slice_bound(self, label, side, kind):
if not isinstance(label, tuple):
- label = label,
+ label = (label,)
return self._partial_tup_index(label, side=side)
def slice_locs(self, start=None, end=None, step=None, kind=None):
@@ -2423,12 +2570,12 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
# happens in get_slice_bound method), but it adds meaningful doc.
return super().slice_locs(start, end, step, kind=kind)
- def _partial_tup_index(self, tup, side='left'):
+ def _partial_tup_index(self, tup, side="left"):
if len(tup) > self.lexsort_depth:
raise UnsortedIndexError(
- 'Key length (%d) was greater than MultiIndex'
- ' lexsort depth (%d)' %
- (len(tup), self.lexsort_depth))
+ "Key length (%d) was greater than MultiIndex"
+ " lexsort depth (%d)" % (len(tup), self.lexsort_depth)
+ )
n = len(tup)
start, end = 0, len(self)
@@ -2437,20 +2584,19 @@ def _partial_tup_index(self, tup, side='left'):
section = labs[start:end]
if lab not in lev:
- if not lev.is_type_compatible(lib.infer_dtype([lab],
- skipna=False)):
- raise TypeError('Level type mismatch: %s' % lab)
+ if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)):
+ raise TypeError("Level type mismatch: %s" % lab)
# short circuit
loc = lev.searchsorted(lab, side=side)
- if side == 'right' and loc >= 0:
+ if side == "right" and loc >= 0:
loc -= 1
return start + section.searchsorted(loc, side=side)
idx = lev.get_loc(lab)
if k < n - 1:
- end = start + section.searchsorted(idx, side='right')
- start = start + section.searchsorted(idx, side='left')
+ end = start + section.searchsorted(idx, side="right")
+ start = start + section.searchsorted(idx, side="left")
else:
return start + section.searchsorted(idx, side=side)
@@ -2495,19 +2641,21 @@ def get_loc(self, key, method=None):
1
"""
if method is not None:
- raise NotImplementedError('only the default get_loc method is '
- 'currently supported for MultiIndex')
+ raise NotImplementedError(
+ "only the default get_loc method is "
+ "currently supported for MultiIndex"
+ )
def _maybe_to_slice(loc):
"""convert integer indexer to boolean mask or slice if possible"""
- if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':
+ if not isinstance(loc, np.ndarray) or loc.dtype != "int64":
return loc
loc = lib.maybe_indices_to_slice(loc, len(self))
if isinstance(loc, slice):
return loc
- mask = np.empty(len(self), dtype='bool')
+ mask = np.empty(len(self), dtype="bool")
mask.fill(False)
mask[loc] = True
return mask
@@ -2518,8 +2666,10 @@ def _maybe_to_slice(loc):
keylen = len(key)
if self.nlevels < keylen:
- raise KeyError('Key length ({0}) exceeds index depth ({1})'
- ''.format(keylen, self.nlevels))
+ raise KeyError(
+ "Key length ({0}) exceeds index depth ({1})"
+ "".format(keylen, self.nlevels)
+ )
if keylen == self.nlevels and self.is_unique:
return self._engine.get_loc(key)
@@ -2530,8 +2680,9 @@ def _maybe_to_slice(loc):
# needs linear search within the slice
i = self.lexsort_depth
lead_key, follow_key = key[:i], key[i:]
- start, stop = (self.slice_locs(lead_key, lead_key)
- if lead_key else (0, len(self)))
+ start, stop = (
+ self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self))
+ )
if start == stop:
raise KeyError(key)
@@ -2539,10 +2690,13 @@ def _maybe_to_slice(loc):
if not follow_key:
return slice(start, stop)
- warnings.warn('indexing past lexsort depth may impact performance.',
- PerformanceWarning, stacklevel=10)
+ warnings.warn(
+ "indexing past lexsort depth may impact performance.",
+ PerformanceWarning,
+ stacklevel=10,
+ )
- loc = np.arange(start, stop, dtype='int64')
+ loc = np.arange(start, stop, dtype="int64")
for i, k in enumerate(follow_key, len(lead_key)):
mask = self.codes[i][loc] == self.levels[i].get_loc(k)
@@ -2551,8 +2705,7 @@ def _maybe_to_slice(loc):
if not len(loc):
raise KeyError(key)
- return (_maybe_to_slice(loc) if len(loc) != stop - start else
- slice(start, stop))
+ return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop)
def get_loc_level(self, key, level=0, drop_level=True):
"""
@@ -2612,8 +2765,9 @@ def maybe_droplevels(indexer, levels, drop_level):
if isinstance(level, (tuple, list)):
if len(key) != len(level):
- raise AssertionError('Key for location must have same '
- 'length as number of levels')
+ raise AssertionError(
+ "Key for location must have same " "length as number of levels"
+ )
result = None
for lev, k in zip(level, key):
loc, new_index = self.get_loc_level(k, level=lev)
@@ -2649,10 +2803,10 @@ def maybe_droplevels(indexer, levels, drop_level):
def partial_selection(key, indexer=None):
if indexer is None:
indexer = self.get_loc(key)
- ilevels = [i for i in range(len(key))
- if key[i] != slice(None, None)]
- return indexer, maybe_droplevels(indexer, ilevels,
- drop_level)
+ ilevels = [
+ i for i in range(len(key)) if key[i] != slice(None, None)
+ ]
+ return indexer, maybe_droplevels(indexer, ilevels, drop_level)
if len(key) == self.nlevels and self.is_unique:
# Complete key in unique index -> standard get_loc
@@ -2683,8 +2837,7 @@ def partial_selection(key, indexer=None):
indexer &= k_index
if indexer is None:
indexer = slice(None, None)
- ilevels = [i for i in range(len(key))
- if key[i] != slice(None, None)]
+ ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
return indexer, maybe_droplevels(indexer, ilevels, drop_level)
else:
indexer = self._get_level_indexer(key, level=level)
@@ -2698,8 +2851,7 @@ def _get_level_indexer(self, key, level=0, indexer=None):
level_index = self.levels[level]
level_codes = self.codes[level]
- def convert_indexer(start, stop, step, indexer=indexer,
- codes=level_codes):
+ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
# given the inputs and the codes/indexer, compute an indexer set
# if we have a provided indexer, then this need not consider
# the entire labels set
@@ -2714,6 +2866,7 @@ def convert_indexer(start, stop, step, indexer=indexer,
# that the result are the mappings to the set that we have
# selected
from pandas import Series
+
mapper = Series(indexer)
indexer = codes.take(ensure_platform_int(indexer))
result = Series(Index(indexer).isin(r).nonzero()[0])
@@ -2721,8 +2874,7 @@ def convert_indexer(start, stop, step, indexer=indexer,
else:
m = np.zeros(len(codes), dtype=bool)
- m[np.in1d(codes, r,
- assume_unique=Index(codes).is_unique)] = True
+ m[np.in1d(codes, r, assume_unique=Index(codes).is_unique)] = True
return m
@@ -2744,8 +2896,9 @@ def convert_indexer(start, stop, step, indexer=indexer,
# we have a partial slice (like looking up a partial date
# string)
- start = stop = level_index.slice_indexer(key.start, key.stop,
- key.step, kind='loc')
+ start = stop = level_index.slice_indexer(
+ key.start, key.stop, key.step, kind="loc"
+ )
step = start.step
if isinstance(start, slice) or isinstance(stop, slice):
@@ -2753,8 +2906,8 @@ def convert_indexer(start, stop, step, indexer=indexer,
# a partial date slicer on a DatetimeIndex generates a slice
# note that the stop ALREADY includes the stopped point (if
# it was a string sliced)
- start = getattr(start, 'start', start)
- stop = getattr(stop, 'stop', stop)
+ start = getattr(start, "start", start)
+ stop = getattr(stop, "stop", stop)
return convert_indexer(start, stop, step)
elif level > 0 or self.lexsort_depth == 0 or step is not None:
@@ -2764,8 +2917,8 @@ def convert_indexer(start, stop, step, indexer=indexer,
return convert_indexer(start, stop + 1, step)
else:
# sorted, so can return slice object -> view
- i = level_codes.searchsorted(start, side='left')
- j = level_codes.searchsorted(stop, side='right')
+ i = level_codes.searchsorted(start, side="left")
+ j = level_codes.searchsorted(stop, side="right")
return slice(i, j, step)
else:
@@ -2780,8 +2933,8 @@ def convert_indexer(start, stop, step, indexer=indexer,
raise KeyError(key)
return locs
- i = level_codes.searchsorted(code, side='left')
- j = level_codes.searchsorted(code, side='right')
+ i = level_codes.searchsorted(code, side="left")
+ j = level_codes.searchsorted(code, side="right")
if i == j:
# The label is present in self.levels[level] but unused:
raise KeyError(key)
@@ -2826,10 +2979,11 @@ def get_locs(self, seq):
# must be lexsorted to at least as many levels
true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]
if true_slices and true_slices[-1] >= self.lexsort_depth:
- raise UnsortedIndexError('MultiIndex slicing requires the index '
- 'to be lexsorted: slicing on levels {0}, '
- 'lexsort depth {1}'
- .format(true_slices, self.lexsort_depth))
+ raise UnsortedIndexError(
+ "MultiIndex slicing requires the index "
+ "to be lexsorted: slicing on levels {0}, "
+ "lexsort depth {1}".format(true_slices, self.lexsort_depth)
+ )
# indexer
# this is the list of all values that we want to select
n = len(self)
@@ -2843,9 +2997,11 @@ def _convert_to_indexer(r):
r = m.nonzero()[0]
elif com.is_bool_indexer(r):
if len(r) != n:
- raise ValueError("cannot index with a boolean indexer "
- "that is not the same length as the "
- "index")
+ raise ValueError(
+ "cannot index with a boolean indexer "
+ "that is not the same length as the "
+ "index"
+ )
r = r.nonzero()[0]
return Int64Index(r)
@@ -2861,8 +3017,7 @@ def _update_indexer(idxr, indexer=indexer):
if com.is_bool_indexer(k):
# a boolean indexer, must be the same length!
k = np.asarray(k)
- indexer = _update_indexer(_convert_to_indexer(k),
- indexer=indexer)
+ indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer)
elif is_list_like(k):
# a collection of labels to include from this level (these
@@ -2871,10 +3026,9 @@ def _update_indexer(idxr, indexer=indexer):
for x in k:
try:
idxrs = _convert_to_indexer(
- self._get_level_indexer(x, level=i,
- indexer=indexer))
- indexers = (idxrs if indexers is None
- else indexers | idxrs)
+ self._get_level_indexer(x, level=i, indexer=indexer)
+ )
+ indexers = idxrs if indexers is None else indexers | idxrs
except KeyError:
# ignore not founds
@@ -2893,14 +3047,20 @@ def _update_indexer(idxr, indexer=indexer):
elif isinstance(k, slice):
# a slice, include BOTH of the labels
- indexer = _update_indexer(_convert_to_indexer(
- self._get_level_indexer(k, level=i, indexer=indexer)),
- indexer=indexer)
+ indexer = _update_indexer(
+ _convert_to_indexer(
+ self._get_level_indexer(k, level=i, indexer=indexer)
+ ),
+ indexer=indexer,
+ )
else:
# a single label
- indexer = _update_indexer(_convert_to_indexer(
- self.get_loc_level(k, level=i, drop_level=False)[0]),
- indexer=indexer)
+ indexer = _update_indexer(
+ _convert_to_indexer(
+ self.get_loc_level(k, level=i, drop_level=False)[0]
+ ),
+ indexer=indexer,
+ )
# empty indexer
if indexer is None:
@@ -2923,7 +3083,7 @@ def truncate(self, before=None, after=None):
truncated : MultiIndex
"""
if after and before and after < before:
- raise ValueError('after < before')
+ raise ValueError("after < before")
i, j = self.levels[0].slice_locs(before, after)
left, right = self.slice_locs(before, after)
@@ -2934,8 +3094,7 @@ def truncate(self, before=None, after=None):
new_codes = [level_codes[left:right] for level_codes in self.codes]
new_codes[0] = new_codes[0] - i
- return MultiIndex(levels=new_levels, codes=new_codes,
- verify_integrity=False)
+ return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False)
def equals(self, other):
"""
@@ -2965,14 +3124,15 @@ def equals(self, other):
for i in range(self.nlevels):
self_codes = self.codes[i]
self_codes = self_codes[self_codes != -1]
- self_values = algos.take_nd(np.asarray(self.levels[i]._values),
- self_codes, allow_fill=False)
+ self_values = algos.take_nd(
+ np.asarray(self.levels[i]._values), self_codes, allow_fill=False
+ )
other_codes = other.codes[i]
other_codes = other_codes[other_codes != -1]
other_values = algos.take_nd(
- np.asarray(other.levels[i]._values),
- other_codes, allow_fill=False)
+ np.asarray(other.levels[i]._values), other_codes, allow_fill=False
+ )
# since we use NaT both datetime64 and timedelta64
# we can have a situation where a level is typed say
@@ -3041,12 +3201,13 @@ def union(self, other, sort=None):
# TODO: Index.union returns other when `len(self)` is 0.
- uniq_tuples = lib.fast_unique_multiple([self._ndarray_values,
- other._ndarray_values],
- sort=sort)
+ uniq_tuples = lib.fast_unique_multiple(
+ [self._ndarray_values, other._ndarray_values], sort=sort
+ )
- return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0,
- names=result_names)
+ return MultiIndex.from_arrays(
+ zip(*uniq_tuples), sortorder=0, names=result_names
+ )
def intersection(self, other, sort=False):
"""
@@ -3084,12 +3245,16 @@ def intersection(self, other, sort=False):
uniq_tuples = sorted(uniq_tuples)
if len(uniq_tuples) == 0:
- return MultiIndex(levels=self.levels,
- codes=[[]] * self.nlevels,
- names=result_names, verify_integrity=False)
+ return MultiIndex(
+ levels=self.levels,
+ codes=[[]] * self.nlevels,
+ names=result_names,
+ verify_integrity=False,
+ )
else:
- return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0,
- names=result_names)
+ return MultiIndex.from_arrays(
+ zip(*uniq_tuples), sortorder=0, names=result_names
+ )
def difference(self, other, sort=None):
"""
@@ -3120,38 +3285,43 @@ def difference(self, other, sort=None):
return self
if self.equals(other):
- return MultiIndex(levels=self.levels,
- codes=[[]] * self.nlevels,
- names=result_names, verify_integrity=False)
+ return MultiIndex(
+ levels=self.levels,
+ codes=[[]] * self.nlevels,
+ names=result_names,
+ verify_integrity=False,
+ )
this = self._get_unique_index()
indexer = this.get_indexer(other)
indexer = indexer.take((indexer != -1).nonzero()[0])
- label_diff = np.setdiff1d(np.arange(this.size), indexer,
- assume_unique=True)
+ label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
difference = this.values.take(label_diff)
if sort is None:
difference = sorted(difference)
if len(difference) == 0:
- return MultiIndex(levels=[[]] * self.nlevels,
- codes=[[]] * self.nlevels,
- names=result_names, verify_integrity=False)
+ return MultiIndex(
+ levels=[[]] * self.nlevels,
+ codes=[[]] * self.nlevels,
+ names=result_names,
+ verify_integrity=False,
+ )
else:
- return MultiIndex.from_tuples(difference, sortorder=0,
- names=result_names)
+ return MultiIndex.from_tuples(difference, sortorder=0, names=result_names)
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if is_categorical_dtype(dtype):
- msg = '> 1 ndim Categorical are not supported at this time'
+ msg = "> 1 ndim Categorical are not supported at this time"
raise NotImplementedError(msg)
elif not is_object_dtype(dtype):
- msg = ('Setting {cls} dtype to anything other than object '
- 'is not supported').format(cls=self.__class__)
+ msg = (
+ "Setting {cls} dtype to anything other than object " "is not supported"
+ ).format(cls=self.__class__)
raise TypeError(msg)
elif copy is True:
return self._shallow_copy()
@@ -3160,13 +3330,15 @@ def astype(self, dtype, copy=True):
def _convert_can_do_setop(self, other):
result_names = self.names
- if not hasattr(other, 'names'):
+ if not hasattr(other, "names"):
if len(other) == 0:
- other = MultiIndex(levels=[[]] * self.nlevels,
- codes=[[]] * self.nlevels,
- verify_integrity=False)
+ other = MultiIndex(
+ levels=[[]] * self.nlevels,
+ codes=[[]] * self.nlevels,
+ verify_integrity=False,
+ )
else:
- msg = 'other must be a MultiIndex or a list of tuples'
+ msg = "other must be a MultiIndex or a list of tuples"
try:
other = MultiIndex.from_tuples(other)
except TypeError:
@@ -3192,10 +3364,9 @@ def insert(self, loc, item):
# Pad the key with empty strings if lower levels of the key
# aren't specified:
if not isinstance(item, tuple):
- item = (item, ) + ('', ) * (self.nlevels - 1)
+ item = (item,) + ("",) * (self.nlevels - 1)
elif len(item) != self.nlevels:
- raise ValueError('Item must have length equal to number of '
- 'levels.')
+ raise ValueError("Item must have length equal to number of " "levels.")
new_levels = []
new_codes = []
@@ -3210,11 +3381,11 @@ def insert(self, loc, item):
lev_loc = level.get_loc(k)
new_levels.append(level)
- new_codes.append(np.insert(
- ensure_int64(level_codes), loc, lev_loc))
+ new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc))
- return MultiIndex(levels=new_levels, codes=new_codes,
- names=self.names, verify_integrity=False)
+ return MultiIndex(
+ levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False
+ )
def delete(self, loc):
"""
@@ -3225,8 +3396,12 @@ def delete(self, loc):
new_index : MultiIndex
"""
new_codes = [np.delete(level_codes, loc) for level_codes in self.codes]
- return MultiIndex(levels=self.levels, codes=new_codes,
- names=self.names, verify_integrity=False)
+ return MultiIndex(
+ levels=self.levels,
+ codes=new_codes,
+ names=self.names,
+ verify_integrity=False,
+ )
def _wrap_joined_index(self, joined, other):
names = self.names if self.names == other.names else None
@@ -3235,8 +3410,7 @@ def _wrap_joined_index(self, joined, other):
@Appender(Index.isin.__doc__)
def isin(self, values, level=None):
if level is None:
- values = MultiIndex.from_tuples(values,
- names=self.names).values
+ values = MultiIndex.from_tuples(values, names=self.names).values
return algos.isin(self.values, values)
else:
num = self._get_level_number(level)
@@ -3255,14 +3429,14 @@ def isin(self, values, level=None):
MultiIndex._add_logical_methods_disabled()
-def _sparsify(label_list, start=0, sentinel=''):
+def _sparsify(label_list, start=0, sentinel=""):
pivoted = list(zip(*label_list))
k = len(label_list)
- result = pivoted[:start + 1]
+ result = pivoted[: start + 1]
prev = pivoted[start]
- for cur in pivoted[start + 1:]:
+ for cur in pivoted[start + 1 :]:
sparse_cur = []
for i, (p, t) in enumerate(zip(prev, cur)):
@@ -3284,4 +3458,4 @@ def _sparsify(label_list, start=0, sentinel=''):
def _get_na_rep(dtype):
- return {np.datetime64: 'NaT', np.timedelta64: 'NaT'}.get(dtype, 'NaN')
+ return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN")
diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py
index 5f9c1f22887cc..daf26d53aa6e2 100644
--- a/pandas/core/indexes/numeric.py
+++ b/pandas/core/indexes/numeric.py
@@ -6,18 +6,29 @@
from pandas.util._decorators import Appender, cache_readonly
from pandas.core.dtypes.common import (
- is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float,
- is_float_dtype, is_integer_dtype, is_scalar, needs_i8_conversion,
- pandas_dtype)
+ is_bool,
+ is_bool_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float,
+ is_float_dtype,
+ is_integer_dtype,
+ is_scalar,
+ needs_i8_conversion,
+ pandas_dtype,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import (
- ABCFloat64Index, ABCInt64Index, ABCRangeIndex, ABCUInt64Index)
+ ABCFloat64Index,
+ ABCInt64Index,
+ ABCRangeIndex,
+ ABCUInt64Index,
+)
from pandas.core.dtypes.missing import isna
from pandas.core import algorithms
import pandas.core.common as com
-from pandas.core.indexes.base import (
- Index, InvalidIndexError, _index_shared_docs)
+from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs
from pandas.core.ops import get_op_result_name
_num_index_shared_docs = dict()
@@ -30,15 +41,18 @@ class NumericIndex(Index):
This is an abstract class
"""
+
_is_numeric_dtype = True
- def __new__(cls, data=None, dtype=None, copy=False, name=None,
- fastpath=None):
+ def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None):
if fastpath is not None:
- warnings.warn("The 'fastpath' keyword is deprecated, and will be "
- "removed in a future version.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
if fastpath:
return cls._simple_new(data, name=name)
@@ -54,18 +68,18 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
else:
subarr = data
- if name is None and hasattr(data, 'name'):
+ if name is None and hasattr(data, "name"):
name = data.name
return cls._simple_new(subarr, name=name)
- @Appender(_index_shared_docs['_maybe_cast_slice_bound'])
+ @Appender(_index_shared_docs["_maybe_cast_slice_bound"])
def _maybe_cast_slice_bound(self, label, side, kind):
- assert kind in ['ix', 'loc', 'getitem', None]
+ assert kind in ["ix", "loc", "getitem", None]
# we will try to coerce to integers
return self._maybe_cast_indexer(label)
- @Appender(_index_shared_docs['_shallow_copy'])
+ @Appender(_index_shared_docs["_shallow_copy"])
def _shallow_copy(self, values=None, **kwargs):
if values is not None and not self._can_hold_na:
# Ensure we are not returning an Int64Index with float data:
@@ -85,17 +99,24 @@ def _convert_for_op(self, value):
def _convert_tolerance(self, tolerance, target):
tolerance = np.asarray(tolerance)
if target.size != tolerance.size and tolerance.size > 1:
- raise ValueError('list-like tolerance size must match '
- 'target index size')
+ raise ValueError("list-like tolerance size must match " "target index size")
if not np.issubdtype(tolerance.dtype, np.number):
if tolerance.ndim > 0:
- raise ValueError(('tolerance argument for %s must contain '
- 'numeric elements if it is list type') %
- (type(self).__name__,))
+ raise ValueError(
+ (
+ "tolerance argument for %s must contain "
+ "numeric elements if it is list type"
+ )
+ % (type(self).__name__,)
+ )
else:
- raise ValueError(('tolerance argument for %s must be numeric '
- 'if it is a scalar: %r') %
- (type(self).__name__, tolerance))
+ raise ValueError(
+ (
+ "tolerance argument for %s must be numeric "
+ "if it is a scalar: %r"
+ )
+ % (type(self).__name__, tolerance)
+ )
return tolerance
@classmethod
@@ -131,9 +152,8 @@ def _union(self, other, sort):
# float | [u]int -> float (the special case)
# | -> T
# | -> object
- needs_cast = (
- (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or
- (is_integer_dtype(other.dtype) and is_float_dtype(self.dtype))
+ needs_cast = (is_integer_dtype(self.dtype) and is_float_dtype(other.dtype)) or (
+ is_integer_dtype(other.dtype) and is_float_dtype(self.dtype)
)
if needs_cast:
first = self.astype("float")
@@ -143,7 +163,9 @@ def _union(self, other, sort):
return super()._union(other, sort)
-_num_index_shared_docs['class_descr'] = """
+_num_index_shared_docs[
+ "class_descr"
+] = """
Immutable ndarray implementing an ordered, sliceable set. The basic object
storing axis labels for all pandas objects. %(klass)s is a special case
of `Index` with purely %(ltype)s labels. %(extra)s
@@ -174,12 +196,7 @@ def _union(self, other, sort):
An Index instance can **only** contain hashable objects.
"""
-_int64_descr_args = dict(
- klass='Int64Index',
- ltype='integer',
- dtype='int64',
- extra=''
-)
+_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="")
class IntegerIndex(NumericIndex):
@@ -201,9 +218,9 @@ def __contains__(self, key):
class Int64Index(IntegerIndex):
- __doc__ = _num_index_shared_docs['class_descr'] % _int64_descr_args
+ __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args
- _typ = 'int64index'
+ _typ = "int64index"
_can_hold_na = False
_engine_type = libindex.Int64Engine
_default_dtype = np.int64
@@ -211,19 +228,19 @@ class Int64Index(IntegerIndex):
@property
def inferred_type(self):
"""Always 'integer' for ``Int64Index``"""
- return 'integer'
+ return "integer"
@property
def asi8(self):
# do not cache or you'll create a memory leak
- return self.values.view('i8')
+ return self.values.view("i8")
- @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ @Appender(_index_shared_docs["_convert_scalar_indexer"])
def _convert_scalar_indexer(self, key, kind=None):
- assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+ assert kind in ["ix", "loc", "getitem", "iloc", None]
# don't coerce ilocs to integers
- if kind != 'iloc':
+ if kind != "iloc":
key = self._maybe_cast_indexer(key)
return super()._convert_scalar_indexer(key, kind=kind)
@@ -238,16 +255,12 @@ def _assert_safe_casting(cls, data, subarr):
"""
if not issubclass(data.dtype.type, np.signedinteger):
if not np.array_equal(data, subarr):
- raise TypeError('Unsafe NumPy casting, you must '
- 'explicitly cast')
+ raise TypeError("Unsafe NumPy casting, you must " "explicitly cast")
def _is_compatible_with_other(self, other):
- return (
- super()._is_compatible_with_other(other)
- or all(isinstance(type(obj), (ABCInt64Index,
- ABCFloat64Index,
- ABCRangeIndex))
- for obj in [self, other])
+ return super()._is_compatible_with_other(other) or all(
+ isinstance(type(obj), (ABCInt64Index, ABCFloat64Index, ABCRangeIndex))
+ for obj in [self, other]
)
@@ -255,17 +268,14 @@ def _is_compatible_with_other(self, other):
Int64Index._add_logical_methods()
_uint64_descr_args = dict(
- klass='UInt64Index',
- ltype='unsigned integer',
- dtype='uint64',
- extra=''
+ klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra=""
)
class UInt64Index(IntegerIndex):
- __doc__ = _num_index_shared_docs['class_descr'] % _uint64_descr_args
+ __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args
- _typ = 'uint64index'
+ _typ = "uint64index"
_can_hold_na = False
_engine_type = libindex.UInt64Engine
_default_dtype = np.uint64
@@ -273,23 +283,23 @@ class UInt64Index(IntegerIndex):
@property
def inferred_type(self):
"""Always 'integer' for ``UInt64Index``"""
- return 'integer'
+ return "integer"
@property
def asi8(self):
# do not cache or you'll create a memory leak
- return self.values.view('u8')
+ return self.values.view("u8")
- @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ @Appender(_index_shared_docs["_convert_scalar_indexer"])
def _convert_scalar_indexer(self, key, kind=None):
- assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+ assert kind in ["ix", "loc", "getitem", "iloc", None]
# don't coerce ilocs to integers
- if kind != 'iloc':
+ if kind != "iloc":
key = self._maybe_cast_indexer(key)
return super()._convert_scalar_indexer(key, kind=kind)
- @Appender(_index_shared_docs['_convert_arr_indexer'])
+ @Appender(_index_shared_docs["_convert_arr_indexer"])
def _convert_arr_indexer(self, keyarr):
# Cast the indexer to uint64 if possible so
# that the values returned from indexing are
@@ -299,7 +309,7 @@ def _convert_arr_indexer(self, keyarr):
return com.asarray_tuplesafe(keyarr, dtype=np.uint64)
return keyarr
- @Appender(_index_shared_docs['_convert_index_indexer'])
+ @Appender(_index_shared_docs["_convert_index_indexer"])
def _convert_index_indexer(self, keyarr):
# Cast the indexer to uint64 if possible so
# that the values returned from indexing are
@@ -319,15 +329,12 @@ def _assert_safe_casting(cls, data, subarr):
"""
if not issubclass(data.dtype.type, np.unsignedinteger):
if not np.array_equal(data, subarr):
- raise TypeError('Unsafe NumPy casting, you must '
- 'explicitly cast')
+ raise TypeError("Unsafe NumPy casting, you must " "explicitly cast")
def _is_compatible_with_other(self, other):
- return (
- super()._is_compatible_with_other(other)
- or all(isinstance(type(obj), (ABCUInt64Index,
- ABCFloat64Index))
- for obj in [self, other])
+ return super()._is_compatible_with_other(other) or all(
+ isinstance(type(obj), (ABCUInt64Index, ABCFloat64Index))
+ for obj in [self, other]
)
@@ -335,67 +342,73 @@ def _is_compatible_with_other(self, other):
UInt64Index._add_logical_methods()
_float64_descr_args = dict(
- klass='Float64Index',
- dtype='float64',
- ltype='float',
- extra=''
+ klass="Float64Index", dtype="float64", ltype="float", extra=""
)
class Float64Index(NumericIndex):
- __doc__ = _num_index_shared_docs['class_descr'] % _float64_descr_args
+ __doc__ = _num_index_shared_docs["class_descr"] % _float64_descr_args
- _typ = 'float64index'
+ _typ = "float64index"
_engine_type = libindex.Float64Engine
_default_dtype = np.float64
@property
def inferred_type(self):
"""Always 'floating' for ``Float64Index``"""
- return 'floating'
+ return "floating"
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if needs_i8_conversion(dtype):
- msg = ('Cannot convert Float64Index to dtype {dtype}; integer '
- 'values are required for conversion').format(dtype=dtype)
+ msg = (
+ "Cannot convert Float64Index to dtype {dtype}; integer "
+ "values are required for conversion"
+ ).format(dtype=dtype)
raise TypeError(msg)
- elif (is_integer_dtype(dtype) and
- not is_extension_array_dtype(dtype)) and self.hasnans:
+ elif (
+ is_integer_dtype(dtype) and not is_extension_array_dtype(dtype)
+ ) and self.hasnans:
# TODO(jreback); this can change once we have an EA Index type
# GH 13149
- raise ValueError('Cannot convert NA to integer')
+ raise ValueError("Cannot convert NA to integer")
return super().astype(dtype, copy=copy)
- @Appender(_index_shared_docs['_convert_scalar_indexer'])
+ @Appender(_index_shared_docs["_convert_scalar_indexer"])
def _convert_scalar_indexer(self, key, kind=None):
- assert kind in ['ix', 'loc', 'getitem', 'iloc', None]
+ assert kind in ["ix", "loc", "getitem", "iloc", None]
- if kind == 'iloc':
- return self._validate_indexer('positional', key, kind)
+ if kind == "iloc":
+ return self._validate_indexer("positional", key, kind)
return key
- @Appender(_index_shared_docs['_convert_slice_indexer'])
+ @Appender(_index_shared_docs["_convert_slice_indexer"])
def _convert_slice_indexer(self, key, kind=None):
# if we are not a slice, then we are done
if not isinstance(key, slice):
return key
- if kind == 'iloc':
+ if kind == "iloc":
return super()._convert_slice_indexer(key, kind=kind)
# translate to locations
return self.slice_indexer(key.start, key.stop, key.step, kind=kind)
- def _format_native_types(self, na_rep='', float_format=None, decimal='.',
- quoting=None, **kwargs):
+ def _format_native_types(
+ self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs
+ ):
from pandas.io.formats.format import FloatArrayFormatter
- formatter = FloatArrayFormatter(self.values, na_rep=na_rep,
- float_format=float_format,
- decimal=decimal, quoting=quoting,
- fixed_width=False)
+
+ formatter = FloatArrayFormatter(
+ self.values,
+ na_rep=na_rep,
+ float_format=float_format,
+ decimal=decimal,
+ quoting=quoting,
+ fixed_width=False,
+ )
return formatter.get_result_as_array()
def get_value(self, series, key):
@@ -424,8 +437,7 @@ def equals(self, other):
try:
if not isinstance(other, Float64Index):
other = self._constructor(other)
- if (not is_dtype_equal(self.dtype, other.dtype) or
- self.shape != other.shape):
+ if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape:
return False
left, right = self._ndarray_values, other._ndarray_values
return ((left == right) | (self._isnan & other._isnan)).all()
@@ -451,7 +463,7 @@ def __contains__(self, other):
return False
- @Appender(_index_shared_docs['get_loc'])
+ @Appender(_index_shared_docs["get_loc"])
def get_loc(self, key, method=None, tolerance=None):
try:
if np.all(np.isnan(key)) or is_bool(key):
@@ -477,13 +489,12 @@ def isin(self, values, level=None):
return algorithms.isin(np.array(self), values)
def _is_compatible_with_other(self, other):
- return (
- super()._is_compatible_with_other(other)
- or all(isinstance(type(obj), (ABCInt64Index,
- ABCFloat64Index,
- ABCUInt64Index,
- ABCRangeIndex))
- for obj in [self, other])
+ return super()._is_compatible_with_other(other) or all(
+ isinstance(
+ type(obj),
+ (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex),
+ )
+ for obj in [self, other]
)
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index f61b2e679f0c8..0013df44614e8 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -4,26 +4,31 @@
import numpy as np
from pandas._libs import index as libindex
-from pandas._libs.tslibs import (
- NaT, frequencies as libfrequencies, iNaT, resolution)
-from pandas._libs.tslibs.period import (
- DIFFERENT_FREQ, IncompatibleFrequency, Period)
+from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution
+from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.core.dtypes.common import (
- is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype,
- is_integer, is_integer_dtype, pandas_dtype)
+ is_bool_dtype,
+ is_datetime64_any_dtype,
+ is_float,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ pandas_dtype,
+)
from pandas.core import common as com
from pandas.core.accessor import delegate_names
from pandas.core.algorithms import unique1d
-from pandas.core.arrays.period import (
- PeriodArray, period_array, validate_dtype_freq)
+from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq
from pandas.core.base import _shared_docs
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import _index_shared_docs, ensure_index
from pandas.core.indexes.datetimelike import (
- DatetimeIndexOpsMixin, DatetimelikeDelegateMixin)
+ DatetimeIndexOpsMixin,
+ DatetimelikeDelegateMixin,
+)
from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index
from pandas.core.missing import isna
from pandas.core.ops import get_op_result_name
@@ -33,8 +38,7 @@
from pandas.tseries.offsets import DateOffset, Tick
_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update(
- dict(target_klass='PeriodIndex or list of Periods'))
+_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods"))
# --- Period index sketch
@@ -42,9 +46,9 @@
def _new_PeriodIndex(cls, **d):
# GH13277 for unpickling
- values = d.pop('data')
- if values.dtype == 'int64':
- freq = d.pop('freq', None)
+ values = d.pop("data")
+ if values.dtype == "int64":
+ freq = d.pop("freq", None)
values = PeriodArray(values, freq=freq)
return cls._simple_new(values, **d)
else:
@@ -55,21 +59,17 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin):
"""
Delegate from PeriodIndex to PeriodArray.
"""
+
_delegate_class = PeriodArray
_delegated_properties = PeriodArray._datetimelike_ops
- _delegated_methods = (
- set(PeriodArray._datetimelike_methods) | {'_addsub_int_array'}
- )
- _raw_properties = {'is_leap_year'}
-
-
-@delegate_names(PeriodArray,
- PeriodDelegateMixin._delegated_properties,
- typ='property')
-@delegate_names(PeriodArray,
- PeriodDelegateMixin._delegated_methods,
- typ="method",
- overwrite=True)
+ _delegated_methods = set(PeriodArray._datetimelike_methods) | {"_addsub_int_array"}
+ _raw_properties = {"is_leap_year"}
+
+
+@delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property")
+@delegate_names(
+ PeriodArray, PeriodDelegateMixin._delegated_methods, typ="method", overwrite=True
+)
class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin):
"""
Immutable ndarray holding ordinal values indicating regular periods in
@@ -161,8 +161,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin):
--------
>>> idx = pd.PeriodIndex(year=year_arr, quarter=q_arr)
"""
- _typ = 'periodindex'
- _attributes = ['name', 'freq']
+
+ _typ = "periodindex"
+ _attributes = ["name", "freq"]
# define my properties & methods for delegation
_is_numeric_dtype = False
@@ -175,39 +176,59 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin):
# ------------------------------------------------------------------------
# Index Constructors
- def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
- periods=None, tz=None, dtype=None, copy=False, name=None,
- **fields):
-
- valid_field_set = {'year', 'month', 'day', 'quarter',
- 'hour', 'minute', 'second'}
+ def __new__(
+ cls,
+ data=None,
+ ordinal=None,
+ freq=None,
+ start=None,
+ end=None,
+ periods=None,
+ tz=None,
+ dtype=None,
+ copy=False,
+ name=None,
+ **fields
+ ):
+
+ valid_field_set = {
+ "year",
+ "month",
+ "day",
+ "quarter",
+ "hour",
+ "minute",
+ "second",
+ }
if not set(fields).issubset(valid_field_set):
- raise TypeError('__new__() got an unexpected keyword argument {}'.
- format(list(set(fields) - valid_field_set)[0]))
+ raise TypeError(
+ "__new__() got an unexpected keyword argument {}".format(
+ list(set(fields) - valid_field_set)[0]
+ )
+ )
- if name is None and hasattr(data, 'name'):
+ if name is None and hasattr(data, "name"):
name = data.name
if data is None and ordinal is None:
# range-based.
- data, freq2 = PeriodArray._generate_range(start, end, periods,
- freq, fields)
+ data, freq2 = PeriodArray._generate_range(start, end, periods, freq, fields)
# PeriodArray._generate_range does validate that fields is
# empty when really using the range-based constructor.
if not fields:
- msg = ("Creating a PeriodIndex by passing range "
- "endpoints is deprecated. Use "
- "`pandas.period_range` instead.")
+ msg = (
+ "Creating a PeriodIndex by passing range "
+ "endpoints is deprecated. Use "
+ "`pandas.period_range` instead."
+ )
# period_range differs from PeriodIndex for cases like
# start="2000", periods=4
# PeriodIndex interprets that as A-DEC freq.
# period_range interprets it as 'D' freq.
- cond = (
- freq is None and (
- (start and not isinstance(start, Period)) or
- (end and not isinstance(end, Period))
- )
+ cond = freq is None and (
+ (start and not isinstance(start, Period))
+ or (end and not isinstance(end, Period))
)
if cond:
msg += (
@@ -291,11 +312,12 @@ def freq(self, value):
value = Period._maybe_convert_freq(value)
# TODO: When this deprecation is enforced, PeriodIndex.freq can
# be removed entirely, and we'll just inherit.
- msg = ('Setting {cls}.freq has been deprecated and will be '
- 'removed in a future version; use {cls}.asfreq instead. '
- 'The {cls}.freq setter is not guaranteed to work.')
- warnings.warn(msg.format(cls=type(self).__name__),
- FutureWarning, stacklevel=2)
+ msg = (
+ "Setting {cls}.freq has been deprecated and will be "
+ "removed in a future version; use {cls}.asfreq instead. "
+ "The {cls}.freq setter is not guaranteed to work."
+ )
+ warnings.warn(msg.format(cls=type(self).__name__), FutureWarning, stacklevel=2)
# PeriodArray._freq isn't actually mutable. We set the private _freq
# here, but people shouldn't be doing this anyway.
self._data._freq = value
@@ -309,8 +331,7 @@ def _shallow_copy(self, values=None, **kwargs):
values = values._values
if not isinstance(values, PeriodArray):
- if (isinstance(values, np.ndarray) and
- is_integer_dtype(values.dtype)):
+ if isinstance(values, np.ndarray) and is_integer_dtype(values.dtype):
values = PeriodArray(values, freq=self.freq)
else:
# in particular, I would like to avoid period_array here.
@@ -322,12 +343,12 @@ def _shallow_copy(self, values=None, **kwargs):
values = period_array(values, freq=self.freq)
# We don't allow changing `freq` in _shallow_copy.
- validate_dtype_freq(self.dtype, kwargs.get('freq'))
+ validate_dtype_freq(self.dtype, kwargs.get("freq"))
attributes = self._get_attributes_dict()
attributes.update(kwargs)
- if not len(values) and 'dtype' not in kwargs:
- attributes['dtype'] = self.dtype
+ if not len(values) and "dtype" not in kwargs:
+ attributes["dtype"] = self.dtype
return self._simple_new(values, **attributes)
def _shallow_copy_with_infer(self, values=None, **kwargs):
@@ -347,6 +368,7 @@ def func(x):
return x
else:
return Period._from_ordinal(ordinal=x, freq=self.freq)
+
return func
def _maybe_convert_timedelta(self, other):
@@ -366,8 +388,7 @@ def _maybe_convert_timedelta(self, other):
IncompatibleFrequency : if the input cannot be written as a multiple
of self.freq. Note IncompatibleFrequency subclasses ValueError.
"""
- if isinstance(
- other, (timedelta, np.timedelta64, Tick, np.ndarray)):
+ if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)):
offset = frequencies.to_offset(self.freq.rule_code)
if isinstance(offset, Tick):
# _check_timedeltalike_freq_compat will raise if incompatible
@@ -379,9 +400,9 @@ def _maybe_convert_timedelta(self, other):
if base == self.freq.rule_code:
return other.n
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=other.freqstr)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr
+ )
raise IncompatibleFrequency(msg)
elif is_integer(other):
# integer is passed to .shift via
@@ -390,19 +411,17 @@ def _maybe_convert_timedelta(self, other):
return other
# raise when input doesn't have freq
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=None)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__, own_freq=self.freqstr, other_freq=None
+ )
raise IncompatibleFrequency(msg)
# ------------------------------------------------------------------------
# Rendering Methods
- def _format_native_types(self, na_rep='NaT', quoting=None, **kwargs):
+ def _format_native_types(self, na_rep="NaT", quoting=None, **kwargs):
# just dispatch, return ndarray
- return self._data._format_native_types(na_rep=na_rep,
- quoting=quoting,
- **kwargs)
+ return self._data._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs)
def _mpl_repr(self):
# how to represent ourselves to matplotlib
@@ -419,7 +438,7 @@ def _formatter_func(self):
def _engine(self):
return self._engine_type(lambda: self, len(self))
- @Appender(_index_shared_docs['contains'])
+ @Appender(_index_shared_docs["contains"])
def __contains__(self, key):
if isinstance(key, Period):
if key.freq != self.freq:
@@ -471,14 +490,13 @@ def __array_wrap__(self, result, context=None):
name = self.name
left = context[1][0]
right = context[1][1]
- if (isinstance(left, PeriodIndex) and
- isinstance(right, PeriodIndex)):
+ if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex):
name = left.name if left.name == right.name else None
return Index(result, name=name)
elif isinstance(left, Period) or isinstance(right, Period):
return Index(result, name=name)
elif isinstance(func, np.ufunc):
- if 'M->M' not in func.types:
+ if "M->M" not in func.types:
msg = "ufunc '{0}' not supported for the PeriodIndex"
# This should be TypeError, but TypeError cannot be raised
# from here because numpy catches.
@@ -501,37 +519,41 @@ def asof_locs(self, where, mask):
where_idx = PeriodIndex(where_idx.values, freq=self.freq)
locs = self._ndarray_values[mask].searchsorted(
- where_idx._ndarray_values, side='right')
+ where_idx._ndarray_values, side="right"
+ )
locs = np.where(locs > 0, locs - 1, 0)
result = np.arange(len(self))[mask].take(locs)
first = mask.argmax()
- result[(locs == 0) & (where_idx._ndarray_values <
- self._ndarray_values[first])] = -1
+ result[
+ (locs == 0) & (where_idx._ndarray_values < self._ndarray_values[first])
+ ] = -1
return result
- @Appender(_index_shared_docs['astype'])
- def astype(self, dtype, copy=True, how='start'):
+ @Appender(_index_shared_docs["astype"])
+ def astype(self, dtype, copy=True, how="start"):
dtype = pandas_dtype(dtype)
if is_datetime64_any_dtype(dtype):
# 'how' is index-specific, isn't part of the EA interface.
- tz = getattr(dtype, 'tz', None)
+ tz = getattr(dtype, "tz", None)
return self.to_timestamp(how=how).tz_localize(tz)
# TODO: should probably raise on `how` here, so we don't ignore it.
return super().astype(dtype, copy=copy)
- @Substitution(klass='PeriodIndex')
- @Appender(_shared_docs['searchsorted'])
- def searchsorted(self, value, side='left', sorter=None):
+ @Substitution(klass="PeriodIndex")
+ @Appender(_shared_docs["searchsorted"])
+ def searchsorted(self, value, side="left", sorter=None):
if isinstance(value, Period):
if value.freq != self.freq:
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=value.freqstr)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=value.freqstr,
+ )
raise IncompatibleFrequency(msg)
value = value.ordinal
elif isinstance(value, str):
@@ -540,8 +562,7 @@ def searchsorted(self, value, side='left', sorter=None):
except DateParseError:
raise KeyError("Cannot interpret '{}' as period".format(value))
- return self._ndarray_values.searchsorted(value, side=side,
- sorter=sorter)
+ return self._ndarray_values.searchsorted(value, side=side, sorter=sorter)
@property
def is_all_dates(self):
@@ -556,7 +577,7 @@ def is_full(self):
if len(self) == 0:
return True
if not self.is_monotonic:
- raise ValueError('Index is not monotonic')
+ raise ValueError("Index is not monotonic")
values = self.asi8
return ((values[1:] - values[:-1]) < 2).all()
@@ -564,7 +585,7 @@ def is_full(self):
def inferred_type(self):
# b/c data is represented as ints make sure we can't have ambiguous
# indexing
- return 'period'
+ return "period"
def get_value(self, series, key):
"""
@@ -573,9 +594,7 @@ def get_value(self, series, key):
"""
s = com.values_from_object(series)
try:
- return com.maybe_box(self,
- super().get_value(s, key),
- series, key)
+ return com.maybe_box(self, super().get_value(s, key), series, key)
except (KeyError, IndexError):
try:
asdt, parsed, reso = parse_time_string(key, self.freq)
@@ -587,8 +606,8 @@ def get_value(self, series, key):
# if our data is higher resolution than requested key, slice
if grp < freqn:
iv = Period(asdt, freq=(grp, 1))
- ord1 = iv.asfreq(self.freq, how='S').ordinal
- ord2 = iv.asfreq(self.freq, how='E').ordinal
+ ord1 = iv.asfreq(self.freq, how="S").ordinal
+ ord2 = iv.asfreq(self.freq, how="E").ordinal
if ord2 < vals[0] or ord1 > vals[-1]:
raise KeyError(key)
@@ -598,8 +617,9 @@ def get_value(self, series, key):
return series[key]
elif grp == freqn:
key = Period(asdt, freq=self.freq).ordinal
- return com.maybe_box(self, self._engine.get_value(s, key),
- series, key)
+ return com.maybe_box(
+ self, self._engine.get_value(s, key), series, key
+ )
else:
raise KeyError(key)
except TypeError:
@@ -607,17 +627,18 @@ def get_value(self, series, key):
period = Period(key, self.freq)
key = period.value if isna(period) else period.ordinal
- return com.maybe_box(self, self._engine.get_value(s, key),
- series, key)
+ return com.maybe_box(self, self._engine.get_value(s, key), series, key)
- @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs)
+ @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
def get_indexer(self, target, method=None, limit=None, tolerance=None):
target = ensure_index(target)
- if hasattr(target, 'freq') and target.freq != self.freq:
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=target.freqstr)
+ if hasattr(target, "freq") and target.freq != self.freq:
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__,
+ own_freq=self.freqstr,
+ other_freq=target.freqstr,
+ )
raise IncompatibleFrequency(msg)
if isinstance(target, PeriodIndex):
@@ -625,8 +646,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
if tolerance is not None:
tolerance = self._convert_tolerance(tolerance, target)
- return Index.get_indexer(self._int64index, target, method,
- limit, tolerance)
+ return Index.get_indexer(self._int64index, target, method, limit, tolerance)
def _get_unique_index(self, dropna=False):
"""
@@ -682,8 +702,7 @@ def get_loc(self, key, method=None, tolerance=None):
try:
ordinal = iNaT if key is NaT else key.ordinal
if tolerance is not None:
- tolerance = self._convert_tolerance(tolerance,
- np.asarray(key))
+ tolerance = self._convert_tolerance(tolerance, np.asarray(key))
return self._int64index.get_loc(ordinal, method, tolerance)
except KeyError:
@@ -709,7 +728,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):
Value of `side` parameter should be validated in caller.
"""
- assert kind in ['ix', 'loc', 'getitem']
+ assert kind in ["ix", "loc", "getitem"]
if isinstance(label, datetime):
return Period(label, freq=self.freq)
@@ -717,86 +736,105 @@ def _maybe_cast_slice_bound(self, label, side, kind):
try:
_, parsed, reso = parse_time_string(label, self.freq)
bounds = self._parsed_string_to_bounds(reso, parsed)
- return bounds[0 if side == 'left' else 1]
+ return bounds[0 if side == "left" else 1]
except Exception:
raise KeyError(label)
elif is_integer(label) or is_float(label):
- self._invalid_indexer('slice', label)
+ self._invalid_indexer("slice", label)
return label
def _parsed_string_to_bounds(self, reso, parsed):
- if reso == 'year':
- t1 = Period(year=parsed.year, freq='A')
- elif reso == 'month':
- t1 = Period(year=parsed.year, month=parsed.month, freq='M')
- elif reso == 'quarter':
+ if reso == "year":
+ t1 = Period(year=parsed.year, freq="A")
+ elif reso == "month":
+ t1 = Period(year=parsed.year, month=parsed.month, freq="M")
+ elif reso == "quarter":
q = (parsed.month - 1) // 3 + 1
- t1 = Period(year=parsed.year, quarter=q, freq='Q-DEC')
- elif reso == 'day':
- t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
- freq='D')
- elif reso == 'hour':
- t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
- hour=parsed.hour, freq='H')
- elif reso == 'minute':
- t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
- hour=parsed.hour, minute=parsed.minute, freq='T')
- elif reso == 'second':
- t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day,
- hour=parsed.hour, minute=parsed.minute,
- second=parsed.second, freq='S')
+ t1 = Period(year=parsed.year, quarter=q, freq="Q-DEC")
+ elif reso == "day":
+ t1 = Period(year=parsed.year, month=parsed.month, day=parsed.day, freq="D")
+ elif reso == "hour":
+ t1 = Period(
+ year=parsed.year,
+ month=parsed.month,
+ day=parsed.day,
+ hour=parsed.hour,
+ freq="H",
+ )
+ elif reso == "minute":
+ t1 = Period(
+ year=parsed.year,
+ month=parsed.month,
+ day=parsed.day,
+ hour=parsed.hour,
+ minute=parsed.minute,
+ freq="T",
+ )
+ elif reso == "second":
+ t1 = Period(
+ year=parsed.year,
+ month=parsed.month,
+ day=parsed.day,
+ hour=parsed.hour,
+ minute=parsed.minute,
+ second=parsed.second,
+ freq="S",
+ )
else:
raise KeyError(reso)
- return (t1.asfreq(self.freq, how='start'),
- t1.asfreq(self.freq, how='end'))
+ return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end"))
def _get_string_slice(self, key):
if not self.is_monotonic:
- raise ValueError('Partial indexing only valid for '
- 'ordered time series')
+ raise ValueError("Partial indexing only valid for " "ordered time series")
key, parsed, reso = parse_time_string(key, self.freq)
grp = resolution.Resolution.get_freq_group(reso)
freqn = resolution.get_freq_group(self.freq)
- if reso in ['day', 'hour', 'minute', 'second'] and not grp < freqn:
+ if reso in ["day", "hour", "minute", "second"] and not grp < freqn:
raise KeyError(key)
t1, t2 = self._parsed_string_to_bounds(reso, parsed)
- return slice(self.searchsorted(t1.ordinal, side='left'),
- self.searchsorted(t2.ordinal, side='right'))
+ return slice(
+ self.searchsorted(t1.ordinal, side="left"),
+ self.searchsorted(t2.ordinal, side="right"),
+ )
def _convert_tolerance(self, tolerance, target):
- tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance,
- target)
+ tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance, target)
if target.size != tolerance.size and tolerance.size > 1:
- raise ValueError('list-like tolerance size must match '
- 'target index size')
+ raise ValueError("list-like tolerance size must match " "target index size")
return self._maybe_convert_timedelta(tolerance)
def insert(self, loc, item):
if not isinstance(item, Period) or self.freq != item.freq:
return self.astype(object).insert(loc, item)
- idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]),
- self[loc:].asi8))
+ idx = np.concatenate(
+ (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8)
+ )
return self._shallow_copy(idx)
- def join(self, other, how='left', level=None, return_indexers=False,
- sort=False):
+ def join(self, other, how="left", level=None, return_indexers=False, sort=False):
"""
See Index.join
"""
self._assert_can_do_setop(other)
if not isinstance(other, PeriodIndex):
- return self.astype(object).join(other, how=how, level=level,
- return_indexers=return_indexers,
- sort=sort)
-
- result = Int64Index.join(self, other, how=how, level=level,
- return_indexers=return_indexers,
- sort=sort)
+ return self.astype(object).join(
+ other, how=how, level=level, return_indexers=return_indexers, sort=sort
+ )
+
+ result = Int64Index.join(
+ self,
+ other,
+ how=how,
+ level=level,
+ return_indexers=return_indexers,
+ sort=sort,
+ )
if return_indexers:
result, lidx, ridx = result
@@ -813,9 +851,9 @@ def _assert_can_do_setop(self, other):
# *Can't* use PeriodIndexes of different freqs
# *Can* use PeriodIndex/DatetimeIndex
if isinstance(other, PeriodIndex) and self.freq != other.freq:
- msg = DIFFERENT_FREQ.format(cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=other.freqstr)
+ msg = DIFFERENT_FREQ.format(
+ cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr
+ )
raise IncompatibleFrequency(msg)
def _wrap_setop_result(self, other, result):
@@ -826,8 +864,7 @@ def _wrap_setop_result(self, other, result):
def _apply_meta(self, rawarr):
if not isinstance(rawarr, PeriodIndex):
- rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq,
- name=self.name)
+ rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name)
return rawarr
def __setstate__(self, state):
@@ -863,9 +900,12 @@ def __setstate__(self, state):
@property
def flags(self):
""" return the ndarray.flags for the underlying data """
- warnings.warn("{obj}.flags is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.flags is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return self._ndarray_values.flags
def item(self):
@@ -876,22 +916,29 @@ def item(self):
.. deprecated:: 0.25.0
"""
- warnings.warn('`item` has been deprecated and will be removed in a '
- 'future version', FutureWarning, stacklevel=2)
+ warnings.warn(
+ "`item` has been deprecated and will be removed in a " "future version",
+ FutureWarning,
+ stacklevel=2,
+ )
# TODO(DatetimeArray): remove
if len(self) == 1:
return self[0]
else:
# copy numpy's message here because Py26 raises an IndexError
- raise ValueError('can only convert an array of size 1 to a '
- 'Python scalar')
+ raise ValueError(
+ "can only convert an array of size 1 to a " "Python scalar"
+ )
@property
def data(self):
""" return the data pointer of the underlying data """
- warnings.warn("{obj}.data is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.data is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return np.asarray(self._data).data
@property
@@ -899,9 +946,12 @@ def base(self):
""" return the base object if the memory of the underlying data is
shared
"""
- warnings.warn("{obj}.base is deprecated and will be removed "
- "in a future version".format(obj=type(self).__name__),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "{obj}.base is deprecated and will be removed "
+ "in a future version".format(obj=type(self).__name__),
+ FutureWarning,
+ stacklevel=2,
+ )
return np.asarray(self._data)
@@ -963,13 +1013,13 @@ def period_range(start=None, end=None, periods=None, freq=None, name=None):
dtype='period[M]', freq='M')
"""
if com.count_not_none(start, end, periods) != 2:
- raise ValueError('Of the three parameters: start, end, and periods, '
- 'exactly two must be specified')
- if freq is None and (not isinstance(start, Period)
- and not isinstance(end, Period)):
- freq = 'D'
-
- data, freq = PeriodArray._generate_range(start, end, periods, freq,
- fields={})
+ raise ValueError(
+ "Of the three parameters: start, end, and periods, "
+ "exactly two must be specified"
+ )
+ if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)):
+ freq = "D"
+
+ data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={})
data = PeriodArray(data, freq=freq)
return PeriodIndex(data, name=name)
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 70ca0b349e7ed..16098c474a473 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -13,10 +13,16 @@
from pandas.core.dtypes import concat as _concat
from pandas.core.dtypes.common import (
- ensure_platform_int, ensure_python_int, is_int64_dtype, is_integer,
- is_integer_dtype, is_list_like, is_scalar, is_timedelta64_dtype)
-from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCSeries, ABCTimedeltaIndex)
+ ensure_platform_int,
+ ensure_python_int,
+ is_int64_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_scalar,
+ is_timedelta64_dtype,
+)
+from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCTimedeltaIndex
from pandas.core import ops
import pandas.core.common as com
@@ -65,7 +71,7 @@ class RangeIndex(Int64Index):
Int64Index : Index of int64 data.
"""
- _typ = 'rangeindex'
+ _typ = "rangeindex"
_engine_type = libindex.Int64Engine
_range = None # type: range
@@ -74,13 +80,24 @@ class RangeIndex(Int64Index):
# --------------------------------------------------------------------
# Constructors
- def __new__(cls, start=None, stop=None, step=None,
- dtype=None, copy=False, name=None, fastpath=None):
+ def __new__(
+ cls,
+ start=None,
+ stop=None,
+ step=None,
+ dtype=None,
+ copy=False,
+ name=None,
+ fastpath=None,
+ ):
if fastpath is not None:
- warnings.warn("The 'fastpath' keyword is deprecated, and will be "
- "removed in a future version.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'fastpath' keyword is deprecated, and will be "
+ "removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
if fastpath:
return cls._simple_new(range(start, stop, step), name=name)
@@ -121,8 +138,9 @@ def from_range(cls, data, name=None, dtype=None):
"""
if not isinstance(data, range):
raise TypeError(
- '{0}(...) must be called with object coercible to a '
- 'range, {1} was passed'.format(cls.__name__, repr(data)))
+ "{0}(...) must be called with object coercible to a "
+ "range, {1} was passed".format(cls.__name__, repr(data))
+ )
cls._validate_dtype(dtype)
return cls._simple_new(data, dtype=dtype, name=name)
@@ -153,7 +171,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
def _validate_dtype(dtype):
""" require dtype to be None or int64 """
if not (dtype is None or is_int64_dtype(dtype)):
- raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex')
+ raise TypeError("Invalid to pass a non-int64 dtype to RangeIndex")
@cache_readonly
def _constructor(self):
@@ -170,8 +188,9 @@ def _data(self):
triggering the construction.
"""
if self._cached_data is None:
- self._cached_data = np.arange(self.start, self.stop, self.step,
- dtype=np.int64)
+ self._cached_data = np.arange(
+ self.start, self.stop, self.step, dtype=np.int64
+ )
return self._cached_data
@cache_readonly
@@ -181,9 +200,7 @@ def _int64index(self):
def _get_data_as_items(self):
""" return a list of tuples of start, stop, step """
rng = self._range
- return [('start', rng.start),
- ('stop', rng.stop),
- ('step', rng.step)]
+ return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)]
def __reduce__(self):
d = self._get_attributes_dict()
@@ -199,20 +216,22 @@ def _format_attrs(self):
"""
attrs = self._get_data_as_items()
if self.name is not None:
- attrs.append(('name', ibase.default_pprint(self.name)))
+ attrs.append(("name", ibase.default_pprint(self.name)))
return attrs
def _format_data(self, name=None):
# we are formatting thru the attributes
return None
- def _format_with_header(self, header, na_rep='NaN', **kwargs):
+ def _format_with_header(self, header, na_rep="NaN", **kwargs):
return header + list(map(pprint_thing, self._range))
# --------------------------------------------------------------------
- _deprecation_message = ("RangeIndex.{} is deprecated and will be "
- "removed in a future version. Use RangeIndex.{} "
- "instead")
+ _deprecation_message = (
+ "RangeIndex.{} is deprecated and will be "
+ "removed in a future version. Use RangeIndex.{} "
+ "instead"
+ )
@cache_readonly
def start(self):
@@ -230,8 +249,11 @@ def _start(self):
.. deprecated:: 0.25.0
Use ``start`` instead.
"""
- warnings.warn(self._deprecation_message.format("_start", "start"),
- DeprecationWarning, stacklevel=2)
+ warnings.warn(
+ self._deprecation_message.format("_start", "start"),
+ DeprecationWarning,
+ stacklevel=2,
+ )
return self.start
@cache_readonly
@@ -250,8 +272,11 @@ def _stop(self):
Use ``stop`` instead.
"""
# GH 25710
- warnings.warn(self._deprecation_message.format("_stop", "stop"),
- DeprecationWarning, stacklevel=2)
+ warnings.warn(
+ self._deprecation_message.format("_stop", "stop"),
+ DeprecationWarning,
+ stacklevel=2,
+ )
return self.stop
@cache_readonly
@@ -271,8 +296,11 @@ def _step(self):
Use ``step`` instead.
"""
# GH 25710
- warnings.warn(self._deprecation_message.format("_step", "step"),
- DeprecationWarning, stacklevel=2)
+ warnings.warn(
+ self._deprecation_message.format("_step", "step"),
+ DeprecationWarning,
+ stacklevel=2,
+ )
return self.step
@cache_readonly
@@ -281,8 +309,10 @@ def nbytes(self):
Return the number of bytes in the underlying data.
"""
rng = self._range
- return getsizeof(rng) + sum(getsizeof(getattr(rng, attr_name))
- for attr_name in ['start', 'stop', 'step'])
+ return getsizeof(rng) + sum(
+ getsizeof(getattr(rng, attr_name))
+ for attr_name in ["start", "stop", "step"]
+ )
def memory_usage(self, deep=False):
"""
@@ -338,7 +368,7 @@ def __contains__(self, key: Union[int, np.integer]) -> bool:
return False
return key in self._range
- @Appender(_index_shared_docs['get_loc'])
+ @Appender(_index_shared_docs["get_loc"])
def get_loc(self, key, method=None, tolerance=None):
if is_integer(key) and method is None and tolerance is None:
new_key = int(key)
@@ -348,25 +378,21 @@ def get_loc(self, key, method=None, tolerance=None):
raise KeyError(key)
return super().get_loc(key, method=method, tolerance=tolerance)
- @Appender(_index_shared_docs['get_indexer'])
+ @Appender(_index_shared_docs["get_indexer"])
def get_indexer(self, target, method=None, limit=None, tolerance=None):
if not (method is None and tolerance is None and is_list_like(target)):
- return super().get_indexer(target, method=method,
- tolerance=tolerance)
+ return super().get_indexer(target, method=method, tolerance=tolerance)
if self.step > 0:
start, stop, step = self.start, self.stop, self.step
else:
# Work on reversed range for simplicity:
- start, stop, step = (self.stop - self.step,
- self.start + 1,
- - self.step)
+ start, stop, step = (self.stop - self.step, self.start + 1, -self.step)
target_array = np.asarray(target)
if not (is_integer_dtype(target_array) and target_array.ndim == 1):
# checks/conversions/roundings are delegated to general method
- return super().get_indexer(target, method=method,
- tolerance=tolerance)
+ return super().get_indexer(target, method=method, tolerance=tolerance)
locs = target_array - start
valid = (locs % step == 0) & (locs >= 0) & (target_array < stop)
@@ -381,16 +407,16 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
def tolist(self):
return list(self._range)
- @Appender(_index_shared_docs['_shallow_copy'])
+ @Appender(_index_shared_docs["_shallow_copy"])
def _shallow_copy(self, values=None, **kwargs):
if values is None:
name = kwargs.get("name", self.name)
return self._simple_new(self._range, name=name)
else:
- kwargs.setdefault('name', self.name)
+ kwargs.setdefault("name", self.name)
return self._int64index._shallow_copy(values, **kwargs)
- @Appender(ibase._index_shared_docs['copy'])
+ @Appender(ibase._index_shared_docs["copy"])
def copy(self, name=None, deep=False, dtype=None, **kwargs):
self._validate_dtype(dtype)
if name is None:
@@ -401,8 +427,7 @@ def _minmax(self, meth):
no_steps = len(self) - 1
if no_steps == -1:
return np.nan
- elif ((meth == 'min' and self.step > 0) or
- (meth == 'max' and self.step < 0)):
+ elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0):
return self.start
return self.start + self.step * no_steps
@@ -411,13 +436,13 @@ def min(self, axis=None, skipna=True, *args, **kwargs):
"""The minimum value of the RangeIndex"""
nv.validate_minmax_axis(axis)
nv.validate_min(args, kwargs)
- return self._minmax('min')
+ return self._minmax("min")
def max(self, axis=None, skipna=True, *args, **kwargs):
"""The maximum value of the RangeIndex"""
nv.validate_minmax_axis(axis)
nv.validate_max(args, kwargs)
- return self._minmax('max')
+ return self._minmax("max")
def argsort(self, *args, **kwargs):
"""
@@ -501,8 +526,7 @@ def intersection(self, other, sort=False):
# calculate parameters for the RangeIndex describing the
# intersection disregarding the lower bounds
- tmp_start = first.start + (second.start - first.start) * \
- first.step // gcd * s
+ tmp_start = first.start + (second.start - first.start) * first.step // gcd * s
new_step = first.step * second.step // gcd
new_range = range(tmp_start, int_high, new_step)
new_index = self._simple_new(new_range)
@@ -586,35 +610,39 @@ def _union(self, other, sort):
start_r = min(start_s, start_o)
end_r = max(end_s, end_o)
if step_o == step_s:
- if ((start_s - start_o) % step_s == 0 and
- (start_s - end_o) <= step_s and
- (start_o - end_s) <= step_s):
+ if (
+ (start_s - start_o) % step_s == 0
+ and (start_s - end_o) <= step_s
+ and (start_o - end_s) <= step_s
+ ):
return self.__class__(start_r, end_r + step_s, step_s)
- if ((step_s % 2 == 0) and
- (abs(start_s - start_o) <= step_s / 2) and
- (abs(end_s - end_o) <= step_s / 2)):
- return self.__class__(start_r,
- end_r + step_s / 2,
- step_s / 2)
+ if (
+ (step_s % 2 == 0)
+ and (abs(start_s - start_o) <= step_s / 2)
+ and (abs(end_s - end_o) <= step_s / 2)
+ ):
+ return self.__class__(start_r, end_r + step_s / 2, step_s / 2)
elif step_o % step_s == 0:
- if ((start_o - start_s) % step_s == 0 and
- (start_o + step_s >= start_s) and
- (end_o - step_s <= end_s)):
+ if (
+ (start_o - start_s) % step_s == 0
+ and (start_o + step_s >= start_s)
+ and (end_o - step_s <= end_s)
+ ):
return self.__class__(start_r, end_r + step_s, step_s)
elif step_s % step_o == 0:
- if ((start_s - start_o) % step_o == 0 and
- (start_s + step_o >= start_o) and
- (end_s - step_o <= end_o)):
+ if (
+ (start_s - start_o) % step_o == 0
+ and (start_s + step_o >= start_o)
+ and (end_s - step_o <= end_o)
+ ):
return self.__class__(start_r, end_r + step_o, step_o)
return self._int64index._union(other, sort=sort)
- @Appender(_index_shared_docs['join'])
- def join(self, other, how='left', level=None, return_indexers=False,
- sort=False):
- if how == 'outer' and self is not other:
+ @Appender(_index_shared_docs["join"])
+ def join(self, other, how="left", level=None, return_indexers=False, sort=False):
+ if how == "outer" and self is not other:
# note: could return RangeIndex in more circumstances
- return self._int64index.join(other, how, level, return_indexers,
- sort)
+ return self._int64index.join(other, how, level, return_indexers, sort)
return super().join(other, how, level, return_indexers, sort)
@@ -643,14 +671,17 @@ def __getitem__(self, key):
try:
return self._range[new_key]
except IndexError:
- raise IndexError("index {key} is out of bounds for axis 0 "
- "with size {size}".format(key=key,
- size=len(self)))
+ raise IndexError(
+ "index {key} is out of bounds for axis 0 "
+ "with size {size}".format(key=key, size=len(self))
+ )
elif is_scalar(key):
- raise IndexError("only integers, slices (`:`), "
- "ellipsis (`...`), numpy.newaxis (`None`) "
- "and integer or boolean "
- "arrays are valid indices")
+ raise IndexError(
+ "only integers, slices (`:`), "
+ "ellipsis (`...`), numpy.newaxis (`None`) "
+ "and integer or boolean "
+ "arrays are valid indices"
+ )
# fall back to Int64Index
return super().__getitem__(key)
@@ -659,9 +690,7 @@ def __floordiv__(self, other):
return NotImplemented
if is_integer(other) and other != 0:
- if (len(self) == 0 or
- self.start % other == 0 and
- self.step % other == 0):
+ if len(self) == 0 or self.start % other == 0 and self.step % other == 0:
start = self.start // other
step = self.step // other
stop = start + len(self) * step
@@ -717,7 +746,7 @@ def _evaluate_numeric_binop(self, other):
try:
# apply if we have an override
if step:
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
rstep = step(left.step, right)
# we don't have a representable op
@@ -728,7 +757,7 @@ def _evaluate_numeric_binop(self, other):
else:
rstep = left.step
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
rstart = op(left.start, right)
rstop = op(left.stop, right)
@@ -737,9 +766,8 @@ def _evaluate_numeric_binop(self, other):
# for compat with numpy / Int64Index
# even if we can represent as a RangeIndex, return
# as a Float64Index if we have float-like descriptors
- if not all(is_integer(x) for x in
- [rstart, rstop, rstep]):
- result = result.astype('float64')
+ if not all(is_integer(x) for x in [rstart, rstop, rstep]):
+ result = result.astype("float64")
return result
@@ -748,7 +776,7 @@ def _evaluate_numeric_binop(self, other):
return op(self._int64index, other)
# TODO: Do attrs get handled reliably?
- name = '__{name}__'.format(name=op.__name__)
+ name = "__{name}__".format(name=op.__name__)
return compat.set_function_name(_evaluate_numeric_binop, name, cls)
cls.__add__ = _make_evaluate_binop(operator.add)
@@ -757,10 +785,8 @@ def _evaluate_numeric_binop(self, other):
cls.__rsub__ = _make_evaluate_binop(ops.rsub)
cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul)
cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul)
- cls.__truediv__ = _make_evaluate_binop(operator.truediv,
- step=operator.truediv)
- cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv,
- step=ops.rtruediv)
+ cls.__truediv__ = _make_evaluate_binop(operator.truediv, step=operator.truediv)
+ cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, step=ops.rtruediv)
RangeIndex._add_numeric_methods()
diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
index ba5507fa71e8c..29ed3c6b97318 100644
--- a/pandas/core/indexes/timedeltas.py
+++ b/pandas/core/indexes/timedeltas.py
@@ -4,13 +4,20 @@
import numpy as np
-from pandas._libs import (
- NaT, Timedelta, index as libindex, join as libjoin, lib)
+from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib
from pandas.util._decorators import Appender, Substitution
from pandas.core.dtypes.common import (
- _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar,
- is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype)
+ _TD_DTYPE,
+ ensure_int64,
+ is_float,
+ is_integer,
+ is_list_like,
+ is_scalar,
+ is_timedelta64_dtype,
+ is_timedelta64_ns_dtype,
+ pandas_dtype,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.missing import isna
@@ -21,8 +28,11 @@
import pandas.core.common as com
from pandas.core.indexes.base import Index, _index_shared_docs
from pandas.core.indexes.datetimelike import (
- DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, maybe_unwrap_index,
- wrap_arithmetic_op)
+ DatetimeIndexOpsMixin,
+ DatetimelikeDelegateMixin,
+ maybe_unwrap_index,
+ wrap_arithmetic_op,
+)
from pandas.core.indexes.numeric import Int64Index
from pandas.core.ops import get_op_result_name
@@ -47,28 +57,24 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin):
# We also have a few "extra" attrs, which may or may not be raw,
# which we don't want to expose in the .dt accessor.
_delegate_class = TimedeltaArray
- _delegated_properties = (TimedeltaArray._datetimelike_ops + [
- 'components',
- ])
- _delegated_methods = TimedeltaArray._datetimelike_methods + [
- '_box_values',
- ]
- _raw_properties = {
- 'components',
- }
- _raw_methods = {
- 'to_pytimedelta',
- }
-
-
-@delegate_names(TimedeltaArray,
- TimedeltaDelegateMixin._delegated_properties,
- typ="property")
-@delegate_names(TimedeltaArray,
- TimedeltaDelegateMixin._delegated_methods,
- typ="method", overwrite=False)
-class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index,
- TimedeltaDelegateMixin):
+ _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"]
+ _delegated_methods = TimedeltaArray._datetimelike_methods + ["_box_values"]
+ _raw_properties = {"components"}
+ _raw_methods = {"to_pytimedelta"}
+
+
+@delegate_names(
+ TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property"
+)
+@delegate_names(
+ TimedeltaArray,
+ TimedeltaDelegateMixin._delegated_methods,
+ typ="method",
+ overwrite=False,
+)
+class TimedeltaIndex(
+ DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin
+):
"""
Immutable ndarray of timedelta64 data, represented internally as int64, and
which can be boxed to timedelta objects
@@ -148,23 +154,23 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index,
been deprecated in favor of :func:`timedelta_range`.
"""
- _typ = 'timedeltaindex'
+ _typ = "timedeltaindex"
_join_precedence = 10
def _join_i8_wrapper(joinf, **kwargs):
- return DatetimeIndexOpsMixin._join_i8_wrapper(
- joinf, dtype='m8[ns]', **kwargs)
+ return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs)
_inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64)
_outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64)
_left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64)
_left_indexer_unique = _join_i8_wrapper(
- libjoin.left_join_indexer_unique_int64, with_indexers=False)
+ libjoin.left_join_indexer_unique_int64, with_indexers=False
+ )
_engine_type = libindex.TimedeltaEngine
- _comparables = ['name', 'freq']
- _attributes = ['name', 'freq']
+ _comparables = ["name", "freq"]
+ _attributes = ["name", "freq"]
_is_numeric_dtype = True
_infer_as_myclass = True
@@ -181,44 +187,67 @@ def _join_i8_wrapper(joinf, **kwargs):
# -------------------------------------------------------------------
# Constructors
- def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
- periods=None, closed=None, dtype=_TD_DTYPE, copy=False,
- name=None, verify_integrity=None):
+ def __new__(
+ cls,
+ data=None,
+ unit=None,
+ freq=None,
+ start=None,
+ end=None,
+ periods=None,
+ closed=None,
+ dtype=_TD_DTYPE,
+ copy=False,
+ name=None,
+ verify_integrity=None,
+ ):
if verify_integrity is not None:
- warnings.warn("The 'verify_integrity' argument is deprecated, "
- "will be removed in a future version.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The 'verify_integrity' argument is deprecated, "
+ "will be removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
else:
verify_integrity = True
if data is None:
freq, freq_infer = dtl.maybe_infer_freq(freq)
- warnings.warn("Creating a TimedeltaIndex by passing range "
- "endpoints is deprecated. Use "
- "`pandas.timedelta_range` instead.",
- FutureWarning, stacklevel=2)
- result = TimedeltaArray._generate_range(start, end, periods, freq,
- closed=closed)
+ warnings.warn(
+ "Creating a TimedeltaIndex by passing range "
+ "endpoints is deprecated. Use "
+ "`pandas.timedelta_range` instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ result = TimedeltaArray._generate_range(
+ start, end, periods, freq, closed=closed
+ )
return cls._simple_new(result._data, freq=freq, name=name)
if is_scalar(data):
- raise TypeError('{cls}() must be called with a '
- 'collection of some kind, {data} was passed'
- .format(cls=cls.__name__, data=repr(data)))
-
- if unit in {'Y', 'y', 'M'}:
- warnings.warn("M and Y units are deprecated and "
- "will be removed in a future version.",
- FutureWarning, stacklevel=2)
+ raise TypeError(
+ "{cls}() must be called with a "
+ "collection of some kind, {data} was passed".format(
+ cls=cls.__name__, data=repr(data)
+ )
+ )
+
+ if unit in {"Y", "y", "M"}:
+ warnings.warn(
+ "M and Y units are deprecated and "
+ "will be removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
if isinstance(data, TimedeltaArray):
if copy:
data = data.copy()
return cls._simple_new(data, name=name, freq=freq)
- if (isinstance(data, TimedeltaIndex) and
- freq is None and name is None):
+ if isinstance(data, TimedeltaIndex) and freq is None and name is None:
if copy:
return data.copy()
else:
@@ -226,8 +255,9 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None,
# - Cases checked above all return/raise before reaching here - #
- tdarr = TimedeltaArray._from_sequence(data, freq=freq, unit=unit,
- dtype=dtype, copy=copy)
+ tdarr = TimedeltaArray._from_sequence(
+ data, freq=freq, unit=unit, dtype=dtype, copy=copy
+ )
return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name)
@classmethod
@@ -235,14 +265,13 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE):
# `dtype` is passed by _shallow_copy in corner cases, should always
# be timedelta64[ns] if present
if not isinstance(values, TimedeltaArray):
- values = TimedeltaArray._simple_new(values, dtype=dtype,
- freq=freq)
+ values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq)
else:
if freq is None:
freq = values.freq
assert isinstance(values, TimedeltaArray), type(values)
assert dtype == _TD_DTYPE, dtype
- assert values.dtype == 'm8[ns]', values.dtype
+ assert values.dtype == "m8[ns]", values.dtype
tdarr = TimedeltaArray._simple_new(values._data, freq=freq)
result = object.__new__(cls)
@@ -262,14 +291,15 @@ def __setstate__(self, state):
super().__setstate__(state)
else:
raise Exception("invalid pickle state")
+
_unpickle_compat = __setstate__
def _maybe_update_attributes(self, attrs):
""" Update Index attributes (e.g. freq) depending on op """
- freq = attrs.get('freq', None)
+ freq = attrs.get("freq", None)
if freq is not None:
# no need to infer if freq is None
- attrs['freq'] = 'infer'
+ attrs["freq"] = "infer"
return attrs
# -------------------------------------------------------------------
@@ -278,13 +308,15 @@ def _maybe_update_attributes(self, attrs):
@property
def _formatter_func(self):
from pandas.io.formats.format import _get_format_timedelta64
+
return _get_format_timedelta64(self, box=True)
- def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
+ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs):
from pandas.io.formats.format import Timedelta64Formatter
- return Timedelta64Formatter(values=self,
- nat_rep=na_rep,
- justify='all').get_result()
+
+ return Timedelta64Formatter(
+ values=self, nat_rep=na_rep, justify="all"
+ ).get_result()
# -------------------------------------------------------------------
# Wrapping TimedeltaArray
@@ -307,7 +339,7 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
@property
def _box_func(self):
- return lambda x: Timedelta(x, unit='ns')
+ return lambda x: Timedelta(x, unit="ns")
def __getitem__(self, key):
result = self._data.__getitem__(key)
@@ -317,7 +349,7 @@ def __getitem__(self, key):
# -------------------------------------------------------------------
- @Appender(_index_shared_docs['astype'])
+ @Appender(_index_shared_docs["astype"])
def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype):
@@ -327,7 +359,7 @@ def astype(self, dtype, copy=True):
result = self._data.astype(dtype, copy=copy)
if self.hasnans:
return Index(result, name=self.name)
- return Index(result.astype('i8'), name=self.name)
+ return Index(result.astype("i8"), name=self.name)
return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy)
def _union(self, other, sort):
@@ -350,8 +382,7 @@ def _union(self, other, sort):
result.freq = to_offset(result.inferred_freq)
return result
- def join(self, other, how='left', level=None, return_indexers=False,
- sort=False):
+ def join(self, other, how="left", level=None, return_indexers=False, sort=False):
"""
See Index.join
"""
@@ -361,9 +392,14 @@ def join(self, other, how='left', level=None, return_indexers=False,
except (TypeError, ValueError):
pass
- return Index.join(self, other, how=how, level=level,
- return_indexers=return_indexers,
- sort=sort)
+ return Index.join(
+ self,
+ other,
+ how=how,
+ level=level,
+ return_indexers=return_indexers,
+ sort=sort,
+ )
def intersection(self, other, sort=False):
"""
@@ -395,8 +431,11 @@ def intersection(self, other, sort=False):
def _wrap_joined_index(self, joined, other):
name = get_op_result_name(self, other)
- if (isinstance(other, TimedeltaIndex) and self.freq == other.freq and
- self._can_fast_union(other)):
+ if (
+ isinstance(other, TimedeltaIndex)
+ and self.freq == other.freq
+ and self._can_fast_union(other)
+ ):
joined = self._shallow_copy(joined, name=name)
return joined
else:
@@ -447,7 +486,7 @@ def _fast_union(self, other):
# concatenate
if left_end < right_end:
- loc = right.searchsorted(left_end, side='right')
+ loc = right.searchsorted(left_end, side="right")
right_chunk = right.values[loc:]
dates = _concat._concat_compat((left.values, right_chunk))
return self._shallow_copy(dates)
@@ -455,7 +494,7 @@ def _fast_union(self, other):
return left
def _maybe_promote(self, other):
- if other.inferred_type == 'timedelta':
+ if other.inferred_type == "timedelta":
other = TimedeltaIndex(other)
return self, other
@@ -470,8 +509,7 @@ def get_value(self, series, key):
return self.get_value_maybe_box(series, key)
try:
- return com.maybe_box(self, Index.get_value(self, series, key),
- series, key)
+ return com.maybe_box(self, Index.get_value(self, series, key), series, key)
except KeyError:
try:
loc = self._get_string_slice(key)
@@ -547,25 +585,23 @@ def _maybe_cast_slice_bound(self, label, side, kind):
label : object
"""
- assert kind in ['ix', 'loc', 'getitem', None]
+ assert kind in ["ix", "loc", "getitem", None]
if isinstance(label, str):
parsed = Timedelta(label)
lbound = parsed.round(parsed.resolution_string)
- if side == 'left':
+ if side == "left":
return lbound
else:
- return (lbound + to_offset(parsed.resolution_string) -
- Timedelta(1, 'ns'))
- elif ((is_integer(label) or is_float(label)) and
- not is_timedelta64_dtype(label)):
- self._invalid_indexer('slice', label)
+ return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns")
+ elif (is_integer(label) or is_float(label)) and not is_timedelta64_dtype(label):
+ self._invalid_indexer("slice", label)
return label
def _get_string_slice(self, key):
if is_integer(key) or is_float(key) or key is NaT:
- self._invalid_indexer('slice', key)
+ self._invalid_indexer("slice", key)
loc = self._partial_td_slice(key)
return loc
@@ -577,9 +613,9 @@ def _partial_td_slice(self, key):
raise NotImplementedError
- @Substitution(klass='TimedeltaIndex')
- @Appender(_shared_docs['searchsorted'])
- def searchsorted(self, value, side='left', sorter=None):
+ @Substitution(klass="TimedeltaIndex")
+ @Appender(_shared_docs["searchsorted"])
+ def searchsorted(self, value, side="left", sorter=None):
if isinstance(value, (np.ndarray, Index)):
value = np.array(value, dtype=_TD_DTYPE, copy=False)
else:
@@ -588,11 +624,11 @@ def searchsorted(self, value, side='left', sorter=None):
return self.values.searchsorted(value, side=side, sorter=sorter)
def is_type_compatible(self, typ):
- return typ == self.inferred_type or typ == 'timedelta'
+ return typ == self.inferred_type or typ == "timedelta"
@property
def inferred_type(self):
- return 'timedelta64'
+ return "timedelta64"
@property
def is_all_dates(self):
@@ -628,16 +664,16 @@ def insert(self, loc, item):
# check freq can be preserved on edge cases
if self.freq is not None:
- if ((loc == 0 or loc == -len(self)) and
- item + self.freq == self[0]):
+ if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]:
freq = self.freq
elif (loc == len(self)) and item - self.freq == self[-1]:
freq = self.freq
item = Timedelta(item).asm8.view(_TD_DTYPE)
try:
- new_tds = np.concatenate((self[:loc].asi8, [item.view(np.int64)],
- self[loc:].asi8))
+ new_tds = np.concatenate(
+ (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8)
+ )
return self._shallow_copy(new_tds, freq=freq)
except (AttributeError, TypeError):
@@ -645,8 +681,7 @@ def insert(self, loc, item):
# fall back to object index
if isinstance(item, str):
return self.astype(object).insert(loc, item)
- raise TypeError(
- "cannot insert TimedeltaIndex with incompatible label")
+ raise TypeError("cannot insert TimedeltaIndex with incompatible label")
def delete(self, loc):
"""
@@ -663,16 +698,15 @@ def delete(self, loc):
"""
new_tds = np.delete(self.asi8, loc)
- freq = 'infer'
+ freq = "infer"
if is_integer(loc):
if loc in (0, -len(self), -1, len(self) - 1):
freq = self.freq
else:
if is_list_like(loc):
- loc = lib.maybe_indices_to_slice(
- ensure_int64(np.array(loc)), len(self))
+ loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self))
if isinstance(loc, slice) and loc.step in (1, None):
- if (loc.start in (0, None) or loc.stop in (len(self), None)):
+ if loc.start in (0, None) or loc.stop in (len(self), None):
freq = self.freq
return TimedeltaIndex(new_tds, name=self.name, freq=freq)
@@ -690,15 +724,20 @@ def _is_convertible_to_index(other):
"""
if isinstance(other, TimedeltaIndex):
return True
- elif (len(other) > 0 and
- other.inferred_type not in ('floating', 'mixed-integer', 'integer',
- 'mixed-integer-float', 'mixed')):
+ elif len(other) > 0 and other.inferred_type not in (
+ "floating",
+ "mixed-integer",
+ "integer",
+ "mixed-integer-float",
+ "mixed",
+ ):
return True
return False
-def timedelta_range(start=None, end=None, periods=None, freq=None,
- name=None, closed=None):
+def timedelta_range(
+ start=None, end=None, periods=None, freq=None, name=None, closed=None
+):
"""
Return a fixed frequency TimedeltaIndex, with day as the default
frequency
@@ -765,9 +804,8 @@ def timedelta_range(start=None, end=None, periods=None, freq=None,
dtype='timedelta64[ns]', freq=None)
"""
if freq is None and com._any_none(periods, start, end):
- freq = 'D'
+ freq = "D"
freq, freq_infer = dtl.maybe_infer_freq(freq)
- tdarr = TimedeltaArray._generate_range(start, end, periods, freq,
- closed=closed)
+ tdarr = TimedeltaArray._generate_range(start, end, periods, freq, closed=closed)
return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 677aefa15d200..ccc3a027af70d 100755
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -9,8 +9,17 @@
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import (
- ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator,
- is_list_like, is_numeric_dtype, is_scalar, is_sequence, is_sparse)
+ ensure_platform_int,
+ is_float,
+ is_integer,
+ is_integer_dtype,
+ is_iterator,
+ is_list_like,
+ is_numeric_dtype,
+ is_scalar,
+ is_sequence,
+ is_sparse,
+)
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.missing import _infer_fill_value, isna
@@ -22,11 +31,11 @@
def get_indexers_list():
return [
- ('ix', _IXIndexer),
- ('iloc', _iLocIndexer),
- ('loc', _LocIndexer),
- ('at', _AtIndexer),
- ('iat', _iAtIndexer),
+ ("ix", _IXIndexer),
+ ("iloc", _iLocIndexer),
+ ("loc", _LocIndexer),
+ ("at", _AtIndexer),
+ ("iat", _iAtIndexer),
]
@@ -102,12 +111,11 @@ def __call__(self, axis=None):
return new_self
def __iter__(self):
- raise NotImplementedError('ix is not iterable')
+ raise NotImplementedError("ix is not iterable")
def __getitem__(self, key):
if type(key) is tuple:
- key = tuple(com.apply_if_callable(x, self.obj)
- for x in key)
+ key = tuple(com.apply_if_callable(x, self.obj) for x in key)
try:
values = self.obj._get_value(*key)
if is_scalar(values):
@@ -134,7 +142,7 @@ def _get_label(self, label, axis=None):
# see GH5667
return self.obj._xs(label, axis=axis)
elif isinstance(label, tuple) and isinstance(label[axis], slice):
- raise IndexingError('no slices here, handle elsewhere')
+ raise IndexingError("no slices here, handle elsewhere")
return self.obj._xs(label, axis=axis)
@@ -154,7 +162,7 @@ def _get_setitem_indexer(self, key):
axis = self.obj._get_axis(0)
- if isinstance(axis, MultiIndex) and self.name != 'iloc':
+ if isinstance(axis, MultiIndex) and self.name != "iloc":
try:
return axis.get_loc(key)
except Exception:
@@ -174,14 +182,13 @@ def _get_setitem_indexer(self, key):
except TypeError as e:
# invalid indexer type vs 'other' indexing errors
- if 'cannot do' in str(e):
+ if "cannot do" in str(e):
raise
raise IndexingError(key)
def __setitem__(self, key, value):
if isinstance(key, tuple):
- key = tuple(com.apply_if_callable(x, self.obj)
- for x in key)
+ key = tuple(com.apply_if_callable(x, self.obj) for x in key)
else:
key = com.apply_if_callable(key, self.obj)
indexer = self._get_setitem_indexer(key)
@@ -216,13 +223,14 @@ def _has_valid_tuple(self, key):
""" check the key for valid keys across my indexer """
for i, k in enumerate(key):
if i >= self.obj.ndim:
- raise IndexingError('Too many indexers')
+ raise IndexingError("Too many indexers")
try:
self._validate_key(k, i)
except ValueError:
- raise ValueError("Location based indexing can only have "
- "[{types}] types"
- .format(types=self._valid_types))
+ raise ValueError(
+ "Location based indexing can only have "
+ "[{types}] types".format(types=self._valid_types)
+ )
def _is_nested_tuple_indexer(self, tup):
if any(isinstance(ax, MultiIndex) for ax in self.obj.axes):
@@ -235,14 +243,15 @@ def _convert_tuple(self, key, is_setter=False):
axis = self.obj._get_axis_number(self.axis)
for i in range(self.ndim):
if i == axis:
- keyidx.append(self._convert_to_indexer(
- key, axis=axis, is_setter=is_setter))
+ keyidx.append(
+ self._convert_to_indexer(key, axis=axis, is_setter=is_setter)
+ )
else:
keyidx.append(slice(None))
else:
for i, k in enumerate(key):
if i >= self.obj.ndim:
- raise IndexingError('Too many indexers')
+ raise IndexingError("Too many indexers")
idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter)
keyidx.append(idx)
return tuple(keyidx)
@@ -272,8 +281,7 @@ def _has_valid_positional_setitem_indexer(self, indexer):
will raise if needed, does not modify the indexer externally
"""
if isinstance(indexer, dict):
- raise IndexError("{0} cannot enlarge its target object"
- .format(self.name))
+ raise IndexError("{0} cannot enlarge its target object".format(self.name))
else:
if not isinstance(indexer, tuple):
indexer = self._tuplify(indexer)
@@ -286,11 +294,14 @@ def _has_valid_positional_setitem_indexer(self, indexer):
pass
elif is_integer(i):
if i >= len(ax):
- raise IndexError("{name} cannot enlarge its target "
- "object".format(name=self.name))
+ raise IndexError(
+ "{name} cannot enlarge its target "
+ "object".format(name=self.name)
+ )
elif isinstance(i, dict):
- raise IndexError("{name} cannot enlarge its target object"
- .format(name=self.name))
+ raise IndexError(
+ "{name} cannot enlarge its target object".format(name=self.name)
+ )
return True
@@ -299,6 +310,7 @@ def _setitem_with_indexer(self, indexer, value):
# also has the side effect of consolidating in-place
from pandas import Series
+
info_axis = self.obj._info_axis_number
# maybe partial set
@@ -309,8 +321,7 @@ def _setitem_with_indexer(self, indexer, value):
if not take_split_path and self.obj._data.blocks:
blk, = self.obj._data.blocks
if 1 < blk.ndim: # in case of dict, keys are indices
- val = list(value.values()) if isinstance(value,
- dict) else value
+ val = list(value.values()) if isinstance(value, dict) else value
take_split_path = not blk._can_hold_element(val)
if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes):
@@ -320,8 +331,9 @@ def _setitem_with_indexer(self, indexer, value):
# if we have any multi-indexes that have non-trivial slices
# (not null slices) then we must take the split path, xref
# GH 10360
- if (isinstance(ax, MultiIndex) and
- not (is_integer(i) or com.is_null_slice(i))):
+ if isinstance(ax, MultiIndex) and not (
+ is_integer(i) or com.is_null_slice(i)
+ ):
take_split_path = True
break
@@ -346,13 +358,14 @@ def _setitem_with_indexer(self, indexer, value):
# or a list-like on the non-info axes if we have a
# list-like
len_non_info_axes = (
- len(_ax) for _i, _ax in enumerate(self.obj.axes)
- if _i != i
+ len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i
)
if any(not l for l in len_non_info_axes):
if not is_list_like_indexer(value):
- raise ValueError("cannot set a frame with no "
- "defined index and a scalar")
+ raise ValueError(
+ "cannot set a frame with no "
+ "defined index and a scalar"
+ )
self.obj[key] = value
return self.obj
@@ -360,7 +373,8 @@ def _setitem_with_indexer(self, indexer, value):
self.obj[key] = _infer_fill_value(value)
new_indexer = convert_from_missing_indexer_tuple(
- indexer, self.obj.axes)
+ indexer, self.obj.axes
+ )
self._setitem_with_indexer(new_indexer, value)
return self.obj
@@ -402,21 +416,19 @@ def _setitem_with_indexer(self, indexer, value):
if index.is_unique:
new_indexer = index.get_indexer([new_index[-1]])
if (new_indexer != -1).any():
- return self._setitem_with_indexer(new_indexer,
- value)
+ return self._setitem_with_indexer(new_indexer, value)
# this preserves dtype of the value
new_values = Series([value])._values
if len(self.obj._values):
try:
- new_values = np.concatenate([self.obj._values,
- new_values])
+ new_values = np.concatenate([self.obj._values, new_values])
except TypeError:
as_obj = self.obj.astype(object)
- new_values = np.concatenate([as_obj,
- new_values])
+ new_values = np.concatenate([as_obj, new_values])
self.obj._data = self.obj._constructor(
- new_values, index=new_index, name=self.obj.name)._data
+ new_values, index=new_index, name=self.obj.name
+ )._data
self.obj._maybe_update_cacher(clear=True)
return self.obj
@@ -424,14 +436,14 @@ def _setitem_with_indexer(self, indexer, value):
# no columns and scalar
if not len(self.obj.columns):
- raise ValueError("cannot set a frame with no defined "
- "columns")
+ raise ValueError(
+ "cannot set a frame with no defined " "columns"
+ )
# append a Series
if isinstance(value, Series):
- value = value.reindex(index=self.obj.columns,
- copy=True)
+ value = value.reindex(index=self.obj.columns, copy=True)
value.name = indexer
# a list-list
@@ -440,11 +452,11 @@ def _setitem_with_indexer(self, indexer, value):
# must have conforming columns
if is_list_like_indexer(value):
if len(value) != len(self.obj.columns):
- raise ValueError("cannot set a row with "
- "mismatched columns")
+ raise ValueError(
+ "cannot set a row with " "mismatched columns"
+ )
- value = Series(value, index=self.obj.columns,
- name=indexer)
+ value = Series(value, index=self.obj.columns, name=indexer)
self.obj._data = self.obj.append(value)._data
self.obj._maybe_update_cacher(clear=True)
@@ -469,46 +481,48 @@ def _setitem_with_indexer(self, indexer, value):
# if we have a partial multiindex, then need to adjust the plane
# indexer here
- if (len(labels) == 1 and
- isinstance(self.obj[labels[0]].axes[0], MultiIndex)):
+ if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex):
item = labels[0]
obj = self.obj[item]
index = obj.index
idx = indexer[:info_axis][0]
- plane_indexer = tuple([idx]) + indexer[info_axis + 1:]
+ plane_indexer = tuple([idx]) + indexer[info_axis + 1 :]
lplane_indexer = length_of_indexer(plane_indexer[0], index)
# require that we are setting the right number of values that
# we are indexing
- if is_list_like_indexer(value) and np.iterable(
- value) and lplane_indexer != len(value):
+ if (
+ is_list_like_indexer(value)
+ and np.iterable(value)
+ and lplane_indexer != len(value)
+ ):
if len(obj[idx]) != len(value):
- raise ValueError("cannot set using a multi-index "
- "selection indexer with a different "
- "length than the value")
+ raise ValueError(
+ "cannot set using a multi-index "
+ "selection indexer with a different "
+ "length than the value"
+ )
# make sure we have an ndarray
- value = getattr(value, 'values', value).ravel()
+ value = getattr(value, "values", value).ravel()
# we can directly set the series here
# as we select a slice indexer on the mi
idx = index._convert_slice_indexer(idx)
obj._consolidate_inplace()
obj = obj.copy()
- obj._data = obj._data.setitem(indexer=tuple([idx]),
- value=value)
+ obj._data = obj._data.setitem(indexer=tuple([idx]), value=value)
self.obj[item] = obj
return
# non-mi
else:
- plane_indexer = indexer[:info_axis] + indexer[info_axis + 1:]
+ plane_indexer = indexer[:info_axis] + indexer[info_axis + 1 :]
if info_axis > 0:
plane_axis = self.obj.axes[:info_axis][0]
- lplane_indexer = length_of_indexer(plane_indexer[0],
- plane_axis)
+ lplane_indexer = length_of_indexer(plane_indexer[0], plane_axis)
else:
lplane_indexer = 0
@@ -521,10 +535,10 @@ def setter(item, v):
# which means essentially reassign to the columns of a
# multi-dim object
# GH6149 (null slice), GH10408 (full bounds)
- if (isinstance(pi, tuple) and
- all(com.is_null_slice(idx) or
- com.is_full_slice(idx, len(self.obj))
- for idx in pi)):
+ if isinstance(pi, tuple) and all(
+ com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj))
+ for idx in pi
+ ):
s = v
else:
# set the item, possibly having a dtype change
@@ -538,8 +552,11 @@ def setter(item, v):
def can_do_equal_len():
""" return True if we have an equal len settable """
- if (not len(labels) == 1 or not np.iterable(value) or
- is_scalar(plane_indexer[0])):
+ if (
+ not len(labels) == 1
+ or not np.iterable(value)
+ or is_scalar(plane_indexer[0])
+ ):
return False
item = labels[0]
@@ -556,7 +573,7 @@ def can_do_equal_len():
# we need an iterable, with a ndim of at least 1
# eg. don't pass through np.array(0)
- if is_list_like_indexer(value) and getattr(value, 'ndim', 1) > 0:
+ if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0:
# we have an equal len Frame
if isinstance(value, ABCDataFrame) and value.ndim > 1:
@@ -567,8 +584,8 @@ def can_do_equal_len():
if item in value:
sub_indexer[info_axis] = item
v = self._align_series(
- tuple(sub_indexer), value[item],
- multiindex_indexer)
+ tuple(sub_indexer), value[item], multiindex_indexer
+ )
else:
v = np.nan
@@ -578,16 +595,18 @@ def can_do_equal_len():
# hasattr first, to avoid coercing to ndarray without reason.
# But we may be relying on the ndarray coercion to check ndim.
# Why not just convert to an ndarray earlier on if needed?
- elif ((hasattr(value, 'ndim') and value.ndim == 2)
- or (not hasattr(value, 'ndim') and
- np.array(value).ndim) == 2):
+ elif (hasattr(value, "ndim") and value.ndim == 2) or (
+ not hasattr(value, "ndim") and np.array(value).ndim
+ ) == 2:
# note that this coerces the dtype if we are mixed
# GH 7551
value = np.array(value, dtype=object)
if len(labels) != value.shape[1]:
- raise ValueError('Must have equal len keys and value '
- 'when setting with an ndarray')
+ raise ValueError(
+ "Must have equal len keys and value "
+ "when setting with an ndarray"
+ )
for i, item in enumerate(labels):
@@ -602,8 +621,10 @@ def can_do_equal_len():
else:
if len(labels) != len(value):
- raise ValueError('Must have equal len keys and value '
- 'when setting with an iterable')
+ raise ValueError(
+ "Must have equal len keys and value "
+ "when setting with an iterable"
+ )
for item, v in zip(labels, value):
setter(item, v)
@@ -620,12 +641,16 @@ def can_do_equal_len():
# if we are setting on the info axis ONLY
# set using those methods to avoid block-splitting
# logic here
- if (len(indexer) > info_axis and
- is_integer(indexer[info_axis]) and
- all(com.is_null_slice(idx)
- for i, idx in enumerate(indexer)
- if i != info_axis) and
- item_labels.is_unique):
+ if (
+ len(indexer) > info_axis
+ and is_integer(indexer[info_axis])
+ and all(
+ com.is_null_slice(idx)
+ for i, idx in enumerate(indexer)
+ if i != info_axis
+ )
+ and item_labels.is_unique
+ ):
self.obj[item_labels[indexer[info_axis]]] = value
return
@@ -643,8 +668,7 @@ def can_do_equal_len():
# actually do the set
self.obj._consolidate_inplace()
- self.obj._data = self.obj._data.setitem(indexer=indexer,
- value=value)
+ self.obj._data = self.obj._data.setitem(indexer=indexer, value=value)
self.obj._maybe_update_cacher(clear=True)
def _align_series(self, indexer, ser, multiindex_indexer=False):
@@ -677,6 +701,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False):
# flatten np.ndarray indexers
def ravel(i):
return i.ravel() if isinstance(i, np.ndarray) else i
+
indexer = tuple(map(ravel, indexer))
aligners = [not com.is_null_slice(idx) for idx in indexer]
@@ -696,8 +721,7 @@ def ravel(i):
# we have a frame, with multiple indexers on both axes; and a
# series, so need to broadcast (see GH5206)
- if (sum_aligners == self.ndim and
- all(is_sequence(_) for _ in indexer)):
+ if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer):
ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values
# single indexer
@@ -741,7 +765,7 @@ def ravel(i):
return ser.reindex(ax)._values
- raise ValueError('Incompatible indexer with Series')
+ raise ValueError("Incompatible indexer with Series")
def _align_frame(self, indexer, df):
is_frame = self.obj.ndim == 2
@@ -772,8 +796,7 @@ def _align_frame(self, indexer, df):
val = df.reindex(idx, columns=cols)._values
return val
- elif ((isinstance(indexer, slice) or is_list_like_indexer(indexer)) and
- is_frame):
+ elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame:
ax = self.obj.index[indexer]
if df.index.equals(ax):
val = df.copy()._values
@@ -781,16 +804,20 @@ def _align_frame(self, indexer, df):
# we have a multi-index and are trying to align
# with a particular, level GH3738
- if (isinstance(ax, MultiIndex) and
- isinstance(df.index, MultiIndex) and
- ax.nlevels != df.index.nlevels):
- raise TypeError("cannot align on a multi-index with out "
- "specifying the join levels")
+ if (
+ isinstance(ax, MultiIndex)
+ and isinstance(df.index, MultiIndex)
+ and ax.nlevels != df.index.nlevels
+ ):
+ raise TypeError(
+ "cannot align on a multi-index with out "
+ "specifying the join levels"
+ )
val = df.reindex(index=ax)._values
return val
- raise ValueError('Incompatible indexer with DataFrame')
+ raise ValueError("Incompatible indexer with DataFrame")
def _getitem_tuple(self, tup):
try:
@@ -809,7 +836,7 @@ def _getitem_tuple(self, tup):
retval = self.obj
for i, key in enumerate(tup):
if i >= self.obj.ndim:
- raise IndexingError('Too many indexers')
+ raise IndexingError("Too many indexers")
if com.is_null_slice(key):
continue
@@ -859,8 +886,10 @@ def _multi_take(self, tup):
"""
# GH 836
o = self.obj
- d = {axis: self._get_listlike_indexer(key, axis)
- for (key, axis) in zip(tup, o._AXIS_ORDERS)}
+ d = {
+ axis: self._get_listlike_indexer(key, axis)
+ for (key, axis) in zip(tup, o._AXIS_ORDERS)
+ }
return o._reindex_with_indexers(d, copy=True, allow_dups=True)
def _convert_for_reindex(self, key, axis=None):
@@ -878,8 +907,7 @@ def _handle_lowerdim_multi_index_axis0(self, tup):
except KeyError as ek:
# raise KeyError if number of indexers match
# else IndexingError will be raised
- if (len(tup) <= self.obj.index.nlevels
- and len(tup) > self.obj.ndim):
+ if len(tup) <= self.obj.index.nlevels and len(tup) > self.obj.ndim:
raise ek
except Exception as e1:
if isinstance(tup[0], (slice, Index)):
@@ -907,7 +935,7 @@ def _getitem_lowerdim(self, tup):
ax0 = self.obj._get_axis(0)
# ...but iloc should handle the tuple as simple integer-location
# instead of checking it as multiindex representation (GH 13797)
- if isinstance(ax0, MultiIndex) and self.name != 'iloc':
+ if isinstance(ax0, MultiIndex) and self.name != "iloc":
result = self._handle_lowerdim_multi_index_axis0(tup)
if result is not None:
return result
@@ -929,15 +957,18 @@ def _getitem_lowerdim(self, tup):
elif section.ndim == self.ndim:
# we're in the middle of slicing through a MultiIndex
# revise the key wrt to `section` by inserting an _NS
- new_key = tup[:i] + (_NS,) + tup[i + 1:]
+ new_key = tup[:i] + (_NS,) + tup[i + 1 :]
else:
- new_key = tup[:i] + tup[i + 1:]
+ new_key = tup[:i] + tup[i + 1 :]
# unfortunately need an odious kludge here because of
# DataFrame transposing convention
- if (isinstance(section, ABCDataFrame) and i > 0 and
- len(new_key) == 2):
+ if (
+ isinstance(section, ABCDataFrame)
+ and i > 0
+ and len(new_key) == 2
+ ):
a, b = new_key
new_key = b, a
@@ -951,7 +982,7 @@ def _getitem_lowerdim(self, tup):
# This is an elided recursive call to iloc/loc/etc'
return getattr(section, self.name)[new_key]
- raise IndexingError('not applicable')
+ raise IndexingError("not applicable")
def _getitem_nested_tuple(self, tup):
# we have a nested tuple so have at least 1 multi-index level
@@ -984,7 +1015,7 @@ def _getitem_nested_tuple(self, tup):
axis += 1
# if we have a scalar, we are done
- if is_scalar(obj) or not hasattr(obj, 'ndim'):
+ if is_scalar(obj) or not hasattr(obj, "ndim"):
break
# has the dim of the obj changed?
@@ -1006,12 +1037,12 @@ def _getitem_axis(self, key, axis=None):
labels = self.obj._get_axis(axis)
if isinstance(key, slice):
return self._get_slice_axis(key, axis=axis)
- elif (is_list_like_indexer(key) and
- not (isinstance(key, tuple) and
- isinstance(labels, MultiIndex))):
+ elif is_list_like_indexer(key) and not (
+ isinstance(key, tuple) and isinstance(labels, MultiIndex)
+ ):
- if hasattr(key, 'ndim') and key.ndim > 1:
- raise ValueError('Cannot index with multidimensional key')
+ if hasattr(key, "ndim") and key.ndim > 1:
+ raise ValueError("Cannot index with multidimensional key")
return self._getitem_iterable(key, axis=axis)
else:
@@ -1066,15 +1097,13 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False):
# Have the index compute an indexer or return None
# if it cannot handle:
- indexer, keyarr = ax._convert_listlike_indexer(key,
- kind=self.name)
+ indexer, keyarr = ax._convert_listlike_indexer(key, kind=self.name)
# We only act on all found values:
if indexer is not None and (indexer != -1).all():
- self._validate_read_indexer(key, indexer, axis,
- raise_missing=raise_missing)
+ self._validate_read_indexer(key, indexer, axis, raise_missing=raise_missing)
return ax[indexer], indexer
- if ax.is_unique and not getattr(ax, 'is_overlapping', False):
+ if ax.is_unique and not getattr(ax, "is_overlapping", False):
# If we are trying to get actual keys from empty Series, we
# patiently wait for a KeyError later on - otherwise, convert
if len(ax) or not len(key):
@@ -1084,9 +1113,9 @@ def _get_listlike_indexer(self, key, axis, raise_missing=False):
else:
keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
- self._validate_read_indexer(keyarr, indexer,
- o._get_axis_number(axis),
- raise_missing=raise_missing)
+ self._validate_read_indexer(
+ keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
+ )
return keyarr, indexer
def _getitem_iterable(self, key, axis=None):
@@ -1129,10 +1158,10 @@ def _getitem_iterable(self, key, axis=None):
return self.obj._take(inds, axis=axis)
else:
# A collection of keys
- keyarr, indexer = self._get_listlike_indexer(key, axis,
- raise_missing=False)
- return self.obj._reindex_with_indexers({axis: [keyarr, indexer]},
- copy=True, allow_dups=True)
+ keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
+ return self.obj._reindex_with_indexers(
+ {axis: [keyarr, indexer]}, copy=True, allow_dups=True
+ )
def _validate_read_indexer(self, key, indexer, axis, raise_missing=False):
"""
@@ -1171,11 +1200,13 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False):
if missing == len(indexer):
raise KeyError(
"None of [{key}] are in the [{axis}]".format(
- key=key, axis=self.obj._get_axis_name(axis)))
+ key=key, axis=self.obj._get_axis_name(axis)
+ )
+ )
# We (temporarily) allow for some missing keys with .loc, except in
# some cases (e.g. setting) in which "raise_missing" will be False
- if not(self.name == 'loc' and not raise_missing):
+ if not (self.name == "loc" and not raise_missing):
not_found = list(set(key) - set(ax))
raise KeyError("{} not in index".format(not_found))
@@ -1185,19 +1216,19 @@ def _validate_read_indexer(self, key, indexer, axis, raise_missing=False):
# code, so we want to avoid warning & then
# just raising
- _missing_key_warning = textwrap.dedent("""
+ _missing_key_warning = textwrap.dedent(
+ """
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
- https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike""") # noqa
+ https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"""
+ ) # noqa
if not (ax.is_categorical() or ax.is_interval()):
- warnings.warn(_missing_key_warning,
- FutureWarning, stacklevel=6)
+ warnings.warn(_missing_key_warning, FutureWarning, stacklevel=6)
- def _convert_to_indexer(self, obj, axis=None, is_setter=False,
- raise_missing=False):
+ def _convert_to_indexer(self, obj, axis=None, is_setter=False, raise_missing=False):
"""
Convert indexing key into something we can use to do actual fancy
indexing on an ndarray
@@ -1239,7 +1270,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False,
except LookupError:
if isinstance(obj, tuple) and isinstance(labels, MultiIndex):
if is_setter and len(obj) == labels.nlevels:
- return {'key': obj}
+ return {"key": obj}
raise
except TypeError:
pass
@@ -1255,14 +1286,14 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False,
if is_setter:
# always valid
- if self.name == 'loc':
- return {'key': obj}
+ if self.name == "loc":
+ return {"key": obj}
# a positional
- if (obj >= self.obj.shape[axis] and
- not isinstance(labels, MultiIndex)):
- raise ValueError("cannot set by positional indexing with "
- "enlargement")
+ if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex):
+ raise ValueError(
+ "cannot set by positional indexing with " "enlargement"
+ )
return obj
@@ -1277,8 +1308,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False,
return inds
else:
# When setting, missing keys are not allowed, even with .loc:
- kwargs = {'raise_missing': True if is_setter else
- raise_missing}
+ kwargs = {"raise_missing": True if is_setter else raise_missing}
return self._get_listlike_indexer(obj, axis, **kwargs)[1]
else:
try:
@@ -1286,7 +1316,7 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False,
except LookupError:
# allow a not found key only if we are a setter
if not is_list_like_indexer(obj) and is_setter:
- return {'key': obj}
+ return {"key": obj}
raise
def _tuplify(self, loc):
@@ -1305,7 +1335,7 @@ def _get_slice_axis(self, slice_obj, axis=None):
indexer = self._convert_slice_indexer(slice_obj, axis)
if isinstance(indexer, slice):
- return self._slice(indexer, axis=axis, kind='iloc')
+ return self._slice(indexer, axis=axis, kind="iloc")
else:
return self.obj._take(indexer, axis=axis)
@@ -1334,17 +1364,18 @@ class _IXIndexer(_NDFrameIndexer):
See more at :ref:`Advanced Indexing <advanced>`.
"""
- _ix_deprecation_warning = textwrap.dedent("""
+ _ix_deprecation_warning = textwrap.dedent(
+ """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
- http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated""") # noqa
+ http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated"""
+ ) # noqa
def __init__(self, name, obj):
- warnings.warn(self._ix_deprecation_warning,
- FutureWarning, stacklevel=2)
+ warnings.warn(self._ix_deprecation_warning, FutureWarning, stacklevel=2)
super().__init__(name, obj)
@Appender(_NDFrameIndexer._validate_key.__doc__)
@@ -1413,8 +1444,7 @@ class _LocationIndexer(_NDFrameIndexer):
def __getitem__(self, key):
if type(key) is tuple:
- key = tuple(com.apply_if_callable(x, self.obj)
- for x in key)
+ key = tuple(com.apply_if_callable(x, self.obj) for x in key)
try:
if self._is_scalar_access(key):
return self._getitem_scalar(key)
@@ -1458,11 +1488,12 @@ def _get_slice_axis(self, slice_obj, axis=None):
return obj.copy(deep=False)
labels = obj._get_axis(axis)
- indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop,
- slice_obj.step, kind=self.name)
+ indexer = labels.slice_indexer(
+ slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name
+ )
if isinstance(indexer, slice):
- return self._slice(indexer, axis=axis, kind='iloc')
+ return self._slice(indexer, axis=axis, kind="iloc")
else:
return self.obj._take(indexer, axis=axis)
@@ -1705,9 +1736,11 @@ class _LocIndexer(_LocationIndexer):
viper mark ii 7 1
"""
- _valid_types = ("labels (MUST BE IN THE INDEX), slices of labels (BOTH "
- "endpoints included! Can be slices of integers if the "
- "index is integers), listlike of labels, boolean")
+ _valid_types = (
+ "labels (MUST BE IN THE INDEX), slices of labels (BOTH "
+ "endpoints included! Can be slices of integers if the "
+ "index is integers), listlike of labels, boolean"
+ )
_exception = KeyError
@Appender(_NDFrameIndexer._validate_key.__doc__)
@@ -1732,7 +1765,7 @@ def _is_scalar_access(self, key):
# that provide the equivalent access of .at and .iat
# a) avoid getting things via sections and (to minimize dtype changes)
# b) provide a performant path
- if not hasattr(key, '__len__'):
+ if not hasattr(key, "__len__"):
return False
if len(key) != self.ndim:
@@ -1761,7 +1794,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
"""Translate any partial string timestamp matches in key, returning the
new key (GH 10331)"""
if isinstance(labels, MultiIndex):
- if (isinstance(key, str) and labels.levels[0].is_all_dates):
+ if isinstance(key, str) and labels.levels[0].is_all_dates:
# Convert key '2016-01-01' to
# ('2016-01-01'[, slice(None, None, None)]+)
key = tuple([key] + [slice(None)] * (len(labels.levels) - 1))
@@ -1771,8 +1804,7 @@ def _get_partial_string_timestamp_match_key(self, key, labels):
# (..., slice('2016-01-01', '2016-01-01', None), ...)
new_key = []
for i, component in enumerate(key):
- if (isinstance(component, str) and
- labels.levels[i].is_all_dates):
+ if isinstance(component, str) and labels.levels[i].is_all_dates:
new_key.append(slice(component, component, None))
else:
new_key.append(component)
@@ -1810,23 +1842,30 @@ def _getitem_axis(self, key, axis=None):
key = list(key)
elif isinstance(key, ABCDataFrame):
# GH 15438
- raise NotImplementedError("Indexing a MultiIndex with a "
- "DataFrame key is not "
- "implemented")
- elif hasattr(key, 'ndim') and key.ndim > 1:
- raise NotImplementedError("Indexing a MultiIndex with a "
- "multidimensional key is not "
- "implemented")
-
- if (not isinstance(key, tuple) and len(key) > 1 and
- not isinstance(key[0], tuple)):
+ raise NotImplementedError(
+ "Indexing a MultiIndex with a "
+ "DataFrame key is not "
+ "implemented"
+ )
+ elif hasattr(key, "ndim") and key.ndim > 1:
+ raise NotImplementedError(
+ "Indexing a MultiIndex with a "
+ "multidimensional key is not "
+ "implemented"
+ )
+
+ if (
+ not isinstance(key, tuple)
+ and len(key) > 1
+ and not isinstance(key[0], tuple)
+ ):
key = tuple([key])
# an iterable multi-selection
if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)):
- if hasattr(key, 'ndim') and key.ndim > 1:
- raise ValueError('Cannot index with multidimensional key')
+ if hasattr(key, "ndim") and key.ndim > 1:
+ raise ValueError("Cannot index with multidimensional key")
return self._getitem_iterable(key, axis=axis)
@@ -1978,19 +2017,25 @@ class _iLocIndexer(_LocationIndexer):
2 1000 3000
"""
- _valid_types = ("integer, integer slice (START point is INCLUDED, END "
- "point is EXCLUDED), listlike of integers, boolean array")
+ _valid_types = (
+ "integer, integer slice (START point is INCLUDED, END "
+ "point is EXCLUDED), listlike of integers, boolean array"
+ )
_exception = IndexError
def _validate_key(self, key, axis):
if com.is_bool_indexer(key):
- if hasattr(key, 'index') and isinstance(key.index, Index):
- if key.index.inferred_type == 'integer':
- raise NotImplementedError("iLocation based boolean "
- "indexing on an integer type "
- "is not available")
- raise ValueError("iLocation based boolean indexing cannot use "
- "an indexable as a mask")
+ if hasattr(key, "index") and isinstance(key.index, Index):
+ if key.index.inferred_type == "integer":
+ raise NotImplementedError(
+ "iLocation based boolean "
+ "indexing on an integer type "
+ "is not available"
+ )
+ raise ValueError(
+ "iLocation based boolean indexing cannot use "
+ "an indexable as a mask"
+ )
return
if isinstance(key, slice):
@@ -2000,22 +2045,25 @@ def _validate_key(self, key, axis):
elif isinstance(key, tuple):
# a tuple should already have been caught by this point
# so don't treat a tuple as a valid indexer
- raise IndexingError('Too many indexers')
+ raise IndexingError("Too many indexers")
elif is_list_like_indexer(key):
arr = np.array(key)
len_axis = len(self.obj._get_axis(axis))
# check that the key has a numeric dtype
if not is_numeric_dtype(arr.dtype):
- raise IndexError(".iloc requires numeric indexers, got "
- "{arr}".format(arr=arr))
+ raise IndexError(
+ ".iloc requires numeric indexers, got " "{arr}".format(arr=arr)
+ )
# check that the key does not exceed the maximum size of the index
if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
raise IndexError("positional indexers are out-of-bounds")
else:
- raise ValueError("Can only index by location with "
- "a [{types}]".format(types=self._valid_types))
+ raise ValueError(
+ "Can only index by location with "
+ "a [{types}]".format(types=self._valid_types)
+ )
def _has_valid_setitem_indexer(self, indexer):
self._has_valid_positional_setitem_indexer(indexer)
@@ -2025,7 +2073,7 @@ def _is_scalar_access(self, key):
# that provide the equivalent access of .at and .iat
# a) avoid getting things via sections and (to minimize dtype changes)
# b) provide a performant path
- if not hasattr(key, '__len__'):
+ if not hasattr(key, "__len__"):
return False
if len(key) != self.ndim:
@@ -2084,7 +2132,7 @@ def _getitem_tuple(self, tup):
axis = 0
for i, key in enumerate(tup):
if i >= self.obj.ndim:
- raise IndexingError('Too many indexers')
+ raise IndexingError("Too many indexers")
if com.is_null_slice(key):
axis += 1
@@ -2111,7 +2159,7 @@ def _get_slice_axis(self, slice_obj, axis=None):
slice_obj = self._convert_slice_indexer(slice_obj, axis)
if isinstance(slice_obj, slice):
- return self._slice(slice_obj, axis=axis, kind='iloc')
+ return self._slice(slice_obj, axis=axis, kind="iloc")
else:
return self.obj._take(slice_obj, axis=axis)
@@ -2158,8 +2206,9 @@ def _getitem_axis(self, key, axis=None):
else:
key = item_from_zerodim(key)
if not is_integer(key):
- raise TypeError("Cannot index by location index with a "
- "non-integer key")
+ raise TypeError(
+ "Cannot index by location index with a " "non-integer key"
+ )
# validate the location
self._validate_integer(key, axis)
@@ -2182,8 +2231,10 @@ def _convert_to_indexer(self, obj, axis=None, is_setter=False):
self._validate_key(obj, axis)
return obj
except ValueError:
- raise ValueError("Can only index by location with "
- "a [{types}]".format(types=self._valid_types))
+ raise ValueError(
+ "Can only index by location with "
+ "a [{types}]".format(types=self._valid_types)
+ )
class _ScalarAccessIndexer(_NDFrameIndexer):
@@ -2199,15 +2250,14 @@ def __getitem__(self, key):
if not is_list_like_indexer(key):
key = tuple([key])
else:
- raise ValueError('Invalid call for scalar access (getting)!')
+ raise ValueError("Invalid call for scalar access (getting)!")
key = self._convert_key(key)
return self.obj._get_value(*key, takeable=self._takeable)
def __setitem__(self, key, value):
if isinstance(key, tuple):
- key = tuple(com.apply_if_callable(x, self.obj)
- for x in key)
+ key = tuple(com.apply_if_callable(x, self.obj) for x in key)
else:
# scalar callable may return tuple
key = com.apply_if_callable(key, self.obj)
@@ -2215,8 +2265,7 @@ def __setitem__(self, key, value):
if not isinstance(key, tuple):
key = self._tuplify(key)
if len(key) != self.obj.ndim:
- raise ValueError('Not enough indexers for scalar access '
- '(setting)!')
+ raise ValueError("Not enough indexers for scalar access " "(setting)!")
key = list(self._convert_key(key, is_setter=True))
key.append(value)
self.obj._set_value(*key, takeable=self._takeable)
@@ -2283,13 +2332,17 @@ def _convert_key(self, key, is_setter=False):
for ax, i in zip(self.obj.axes, key):
if ax.is_integer():
if not is_integer(i):
- raise ValueError("At based indexing on an integer index "
- "can only have integer indexers")
+ raise ValueError(
+ "At based indexing on an integer index "
+ "can only have integer indexers"
+ )
else:
if is_integer(i) and not ax.holds_integer():
- raise ValueError("At based indexing on an non-integer "
- "index can only have non-integer "
- "indexers")
+ raise ValueError(
+ "At based indexing on an non-integer "
+ "index can only have non-integer "
+ "indexers"
+ )
return key
@@ -2348,8 +2401,7 @@ def _convert_key(self, key, is_setter=False):
""" require integer args (and convert to label arguments) """
for a, i in zip(self.obj.axes, key):
if not is_integer(i):
- raise ValueError("iAt based indexing can only have integer "
- "indexers")
+ raise ValueError("iAt based indexing can only have integer " "indexers")
return key
@@ -2388,7 +2440,7 @@ def convert_to_index_sliceable(obj, key):
"""
idx = obj.index
if isinstance(key, slice):
- return idx._convert_slice_indexer(key, kind='getitem')
+ return idx._convert_slice_indexer(key, kind="getitem")
elif isinstance(key, str):
@@ -2440,9 +2492,11 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
result = result.reindex(index)
mask = isna(result._values)
if mask.any():
- raise IndexingError('Unalignable boolean Series provided as '
- 'indexer (index of the boolean Series and of '
- 'the indexed object do not match).')
+ raise IndexingError(
+ "Unalignable boolean Series provided as "
+ "indexer (index of the boolean Series and of "
+ "the indexed object do not match)."
+ )
result = result.astype(bool)._values
else:
if is_sparse(result):
@@ -2452,8 +2506,8 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
# GH26658
if len(result) != len(index):
raise IndexError(
- 'Item wrong length {} instead of {}.'.format(len(result),
- len(index)))
+ "Item wrong length {} instead of {}.".format(len(result), len(index))
+ )
return result
@@ -2488,18 +2542,24 @@ def check_setitem_lengths(indexer, value, values):
# boolean with truth values == len of the value is ok too
if isinstance(indexer, (np.ndarray, list)):
if is_list_like(value) and len(indexer) != len(value):
- if not (isinstance(indexer, np.ndarray) and
- indexer.dtype == np.bool_ and
- len(indexer[indexer]) == len(value)):
- raise ValueError("cannot set using a list-like indexer "
- "with a different length than the value")
+ if not (
+ isinstance(indexer, np.ndarray)
+ and indexer.dtype == np.bool_
+ and len(indexer[indexer]) == len(value)
+ ):
+ raise ValueError(
+ "cannot set using a list-like indexer "
+ "with a different length than the value"
+ )
# slice
elif isinstance(indexer, slice):
if is_list_like(value) and len(values):
if len(value) != length_of_indexer(indexer, values):
- raise ValueError("cannot set using a slice indexer with a "
- "different length than the value")
+ raise ValueError(
+ "cannot set using a slice indexer with a "
+ "different length than the value"
+ )
def convert_missing_indexer(indexer):
@@ -2511,7 +2571,7 @@ def convert_missing_indexer(indexer):
if isinstance(indexer, dict):
# a missing key (but not a tuple indexer)
- indexer = indexer['key']
+ indexer = indexer["key"]
if isinstance(indexer, bool):
raise KeyError("cannot use a single bool to index into setitem")
@@ -2526,8 +2586,7 @@ def convert_from_missing_indexer_tuple(indexer, axes):
"""
def get_indexer(_i, _idx):
- return (axes[_i].get_loc(_idx['key']) if isinstance(_idx, dict) else
- _idx)
+ return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx
return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer))
@@ -2608,8 +2667,9 @@ def validate_indices(indices, n):
if len(indices):
min_idx = indices.min()
if min_idx < -1:
- msg = ("'indices' contains values less than allowed ({} < {})"
- .format(min_idx, -1))
+ msg = "'indices' contains values less than allowed ({} < {})".format(
+ min_idx, -1
+ )
raise ValueError(msg)
max_idx = indices.max()
@@ -2648,8 +2708,7 @@ def is_nested_tuple(tup, labels):
def is_list_like_indexer(key):
# allow a list_like, but exclude NamedTuples which can be indexers
- return is_list_like(key) and not (isinstance(key, tuple) and
- type(key) is not tuple)
+ return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple)
def is_label_like(key):
@@ -2658,8 +2717,11 @@ def is_label_like(key):
def need_slice(obj):
- return (obj.start is not None or obj.stop is not None or
- (obj.step is not None and obj.step != 1))
+ return (
+ obj.start is not None
+ or obj.stop is not None
+ or (obj.step is not None and obj.step != 1)
+ )
def maybe_droplevels(index, key):
@@ -2697,8 +2759,9 @@ def _non_reducing_slice(slice_):
def pred(part):
# true when slice does *not* reduce, False when part is a tuple,
# i.e. MultiIndex slice
- return ((isinstance(part, slice) or is_list_like(part))
- and not isinstance(part, tuple))
+ return (isinstance(part, slice) or is_list_like(part)) and not isinstance(
+ part, tuple
+ )
if not is_list_like(slice_):
if not isinstance(slice_, slice):
diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py
index bf46e5d1a74e4..8ac0df2fa4e0a 100644
--- a/pandas/core/internals/__init__.py
+++ b/pandas/core/internals/__init__.py
@@ -1,16 +1,28 @@
-
from .blocks import ( # noqa: F401
- Block, BoolBlock, CategoricalBlock, ComplexBlock, DatetimeBlock,
- DatetimeTZBlock, ExtensionBlock, FloatBlock, IntBlock, ObjectBlock,
- TimeDeltaBlock)
+ Block,
+ BoolBlock,
+ CategoricalBlock,
+ ComplexBlock,
+ DatetimeBlock,
+ DatetimeTZBlock,
+ ExtensionBlock,
+ FloatBlock,
+ IntBlock,
+ ObjectBlock,
+ TimeDeltaBlock,
+)
from .managers import ( # noqa: F401
- BlockManager, SingleBlockManager, create_block_manager_from_arrays,
- create_block_manager_from_blocks)
+ BlockManager,
+ SingleBlockManager,
+ create_block_manager_from_arrays,
+ create_block_manager_from_blocks,
+)
from .blocks import _safe_reshape # noqa: F401; io.packers
from .blocks import make_block # noqa: F401; io.pytables, io.packers
from .managers import ( # noqa: F401; reshape.concat, reshape.merge
_transform_index,
- concatenate_block_managers)
+ concatenate_block_managers,
+)
from .blocks import _block_shape # noqa:F401; io.pytables
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index a9b2c0491458c..34186b60de27c 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -13,27 +13,63 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.cast import (
- astype_nansafe, find_common_type, infer_dtype_from,
- infer_dtype_from_scalar, maybe_convert_objects, maybe_downcast_to_dtype,
- maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects)
+ astype_nansafe,
+ find_common_type,
+ infer_dtype_from,
+ infer_dtype_from_scalar,
+ maybe_convert_objects,
+ maybe_downcast_to_dtype,
+ maybe_infer_dtype_type,
+ maybe_promote,
+ maybe_upcast,
+ soft_convert_objects,
+)
from pandas.core.dtypes.common import (
- _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical,
- is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
- is_dtype_equal, is_extension_array_dtype, is_extension_type,
- is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype,
- is_list_like, is_numeric_v_string_like, is_object_dtype, is_period_dtype,
- is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype)
+ _NS_DTYPE,
+ _TD_DTYPE,
+ ensure_platform_int,
+ is_bool_dtype,
+ is_categorical,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_interval_dtype,
+ is_list_like,
+ is_numeric_v_string_like,
+ is_object_dtype,
+ is_period_dtype,
+ is_re,
+ is_re_compilable,
+ is_sparse,
+ is_timedelta64_dtype,
+ pandas_dtype,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass,
- ABCPandasArray, ABCSeries)
-from pandas.core.dtypes.missing import (
- _isna_compat, array_equivalent, isna, notna)
+ ABCDataFrame,
+ ABCDatetimeIndex,
+ ABCExtensionArray,
+ ABCIndexClass,
+ ABCPandasArray,
+ ABCSeries,
+)
+from pandas.core.dtypes.missing import _isna_compat, array_equivalent, isna, notna
import pandas.core.algorithms as algos
from pandas.core.arrays import (
- Categorical, DatetimeArray, ExtensionArray, PandasDtype, TimedeltaArray)
+ Categorical,
+ DatetimeArray,
+ ExtensionArray,
+ PandasDtype,
+ TimedeltaArray,
+)
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.indexing import check_setitem_lengths
@@ -51,7 +87,8 @@ class Block(PandasObject):
Index-ignorant; let the container take care of that
"""
- __slots__ = ['_mgr_locs', 'values', 'ndim']
+
+ __slots__ = ["_mgr_locs", "values", "ndim"]
is_numeric = False
is_float = False
is_integer = False
@@ -67,7 +104,7 @@ class Block(PandasObject):
_can_consolidate = True
_verify_integrity = True
_validate_ndim = True
- _ftype = 'dense'
+ _ftype = "dense"
_concatenator = staticmethod(np.concatenate)
def __init__(self, values, placement, ndim=None):
@@ -75,11 +112,11 @@ def __init__(self, values, placement, ndim=None):
self.mgr_locs = placement
self.values = values
- if (self._validate_ndim and self.ndim and
- len(self.mgr_locs) != len(self.values)):
+ if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
raise ValueError(
- 'Wrong number of items passed {val}, placement implies '
- '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
+ "Wrong number of items passed {val}, placement implies "
+ "{mgr}".format(val=len(self.values), mgr=len(self.mgr_locs))
+ )
def _check_ndim(self, values, ndim):
"""
@@ -106,8 +143,7 @@ def _check_ndim(self, values, ndim):
ndim = values.ndim
if self._validate_ndim and values.ndim != ndim:
- msg = ("Wrong number of dimensions. values.ndim != ndim "
- "[{} != {}]")
+ msg = "Wrong number of dimensions. values.ndim != ndim " "[{} != {}]"
raise ValueError(msg.format(values.ndim, ndim))
return ndim
@@ -218,32 +254,38 @@ def make_block(self, values, placement=None):
return make_block(values, placement=placement, ndim=self.ndim)
- def make_block_same_class(self, values, placement=None, ndim=None,
- dtype=None):
+ def make_block_same_class(self, values, placement=None, ndim=None, dtype=None):
""" Wrap given values in a block of same type as self. """
if dtype is not None:
# issue 19431 fastparquet is passing this
- warnings.warn("dtype argument is deprecated, will be removed "
- "in a future release.", FutureWarning)
+ warnings.warn(
+ "dtype argument is deprecated, will be removed " "in a future release.",
+ FutureWarning,
+ )
if placement is None:
placement = self.mgr_locs
- return make_block(values, placement=placement, ndim=ndim,
- klass=self.__class__, dtype=dtype)
+ return make_block(
+ values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype
+ )
def __repr__(self):
# don't want to print out all of the items here
name = pprint_thing(self.__class__.__name__)
if self._is_single_block:
- result = '{name}: {len} dtype: {dtype}'.format(
- name=name, len=len(self), dtype=self.dtype)
+ result = "{name}: {len} dtype: {dtype}".format(
+ name=name, len=len(self), dtype=self.dtype
+ )
else:
- shape = ' x '.join(pprint_thing(s) for s in self.shape)
- result = '{name}: {index}, {shape}, dtype: {dtype}'.format(
- name=name, index=pprint_thing(self.mgr_locs.indexer),
- shape=shape, dtype=self.dtype)
+ shape = " x ".join(pprint_thing(s) for s in self.shape)
+ result = "{name}: {index}, {shape}, dtype: {dtype}".format(
+ name=name,
+ index=pprint_thing(self.mgr_locs.indexer),
+ shape=shape,
+ dtype=self.dtype,
+ )
return result
@@ -292,7 +334,7 @@ def dtype(self):
@property
def ftype(self):
- if getattr(self.values, '_pandas_ftype', False):
+ if getattr(self.values, "_pandas_ftype", False):
dtype = self.dtype.subtype
else:
dtype = self.dtype
@@ -305,10 +347,12 @@ def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
"""
- values = self._concatenator([blk.values for blk in to_concat],
- axis=self.ndim - 1)
+ values = self._concatenator(
+ [blk.values for blk in to_concat], axis=self.ndim - 1
+ )
return self.make_block_same_class(
- values, placement=placement or slice(0, len(values), 1))
+ values, placement=placement or slice(0, len(values), 1)
+ )
def iget(self, i):
return self.values[i]
@@ -334,11 +378,10 @@ def apply(self, func, **kwargs):
""" apply the function to my values; return a block if we are not
one
"""
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = func(self.values, **kwargs)
if not isinstance(result, Block):
- result = self.make_block(values=_block_shape(result,
- ndim=self.ndim))
+ result = self.make_block(values=_block_shape(result, ndim=self.ndim))
return result
@@ -346,17 +389,18 @@ def fillna(self, value, limit=None, inplace=False, downcast=None):
""" fillna on the block with the value. If we fail, then convert to
ObjectBlock and try again
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
mask = isna(self.values)
if limit is not None:
if not is_integer(limit):
- raise ValueError('Limit must be an integer')
+ raise ValueError("Limit must be an integer")
if limit < 1:
- raise ValueError('Limit must be greater than 0')
+ raise ValueError("Limit must be greater than 0")
if self.ndim > 2:
- raise NotImplementedError("number of dimensions for 'fillna' "
- "is currently limited to 2")
+ raise NotImplementedError(
+ "number of dimensions for 'fillna' " "is currently limited to 2"
+ )
mask[mask.cumsum(self.ndim - 1) > limit] = False
if not self._can_hold_na:
@@ -371,8 +415,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None):
self._try_coerce_args(value)
blocks = self.putmask(mask, value, inplace=inplace)
- blocks = [b.make_block(values=self._try_coerce_result(b.values))
- for b in blocks]
+ blocks = [
+ b.make_block(values=self._try_coerce_result(b.values)) for b in blocks
+ ]
return self._maybe_downcast(blocks, downcast)
except (TypeError, ValueError):
@@ -387,10 +432,7 @@ def f(m, v, i):
# slice out our block
if i is not None:
block = block.getitem_block(slice(i, i + 1))
- return block.fillna(value,
- limit=limit,
- inplace=inplace,
- downcast=None)
+ return block.fillna(value, limit=limit, inplace=inplace, downcast=None)
return self.split_and_operate(mask, f, inplace)
@@ -424,8 +466,7 @@ def make_a_block(nv, ref_loc):
# Put back the dimension that was taken from it and make
# a block out of the result.
nv = _block_shape(nv, ndim=self.ndim)
- block = self.make_block(values=nv,
- placement=ref_loc)
+ block = self.make_block(values=nv, placement=ref_loc)
return block
# ndim == 1
@@ -481,7 +522,7 @@ def downcast(self, dtypes=None):
# try to cast all non-floats here
if dtypes is None:
- dtypes = 'infer'
+ dtypes = "infer"
nv = maybe_downcast_to_dtype(values, dtypes)
return self.make_block(nv)
@@ -490,16 +531,17 @@ def downcast(self, dtypes=None):
if dtypes is None:
return self
- if not (dtypes == 'infer' or isinstance(dtypes, dict)):
- raise ValueError("downcast must have a dictionary or 'infer' as "
- "its argument")
+ if not (dtypes == "infer" or isinstance(dtypes, dict)):
+ raise ValueError(
+ "downcast must have a dictionary or 'infer' as " "its argument"
+ )
# operate column-by-column
# this is expensive as it splits the blocks items-by-item
def f(m, v, i):
- if dtypes == 'infer':
- dtype = 'infer'
+ if dtypes == "infer":
+ dtype = "infer"
else:
raise AssertionError("dtypes as dict is not supported yet")
@@ -509,12 +551,10 @@ def f(m, v, i):
return self.split_and_operate(None, f, False)
- def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
- return self._astype(dtype, copy=copy, errors=errors, values=values,
- **kwargs)
+ def astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
+ return self._astype(dtype, copy=copy, errors=errors, values=values, **kwargs)
- def _astype(self, dtype, copy=False, errors='raise', values=None,
- **kwargs):
+ def _astype(self, dtype, copy=False, errors="raise", values=None, **kwargs):
"""Coerce to the new type
Parameters
@@ -530,31 +570,34 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
-------
Block
"""
- errors_legal_values = ('raise', 'ignore')
+ errors_legal_values = ("raise", "ignore")
if errors not in errors_legal_values:
- invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. "
- "Supplied value is '{}'".format(
- list(errors_legal_values), errors))
+ invalid_arg = (
+ "Expected value of kwarg 'errors' to be one of {}. "
+ "Supplied value is '{}'".format(list(errors_legal_values), errors)
+ )
raise ValueError(invalid_arg)
- if (inspect.isclass(dtype) and
- issubclass(dtype, ExtensionDtype)):
- msg = ("Expected an instance of {}, but got the class instead. "
- "Try instantiating 'dtype'.".format(dtype.__name__))
+ if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
+ msg = (
+ "Expected an instance of {}, but got the class instead. "
+ "Try instantiating 'dtype'.".format(dtype.__name__)
+ )
raise TypeError(msg)
# may need to convert to categorical
if self.is_categorical_astype(dtype):
# deprecated 17636
- for deprecated_arg in ('categories', 'ordered'):
+ for deprecated_arg in ("categories", "ordered"):
if deprecated_arg in kwargs:
- raise ValueError('Got an unexpected argument: {}'.format(
- deprecated_arg))
+ raise ValueError(
+ "Got an unexpected argument: {}".format(deprecated_arg)
+ )
- categories = kwargs.get('categories', None)
- ordered = kwargs.get('ordered', None)
+ categories = kwargs.get("categories", None)
+ ordered = kwargs.get("ordered", None)
if com._any_not_none(categories, ordered):
dtype = CategoricalDtype(categories, ordered)
@@ -602,12 +645,11 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
values = values.reshape(self.shape)
except Exception: # noqa: E722
- if errors == 'raise':
+ if errors == "raise":
raise
newb = self.copy() if copy else self
else:
- newb = make_block(values, placement=self.mgr_locs,
- ndim=self.ndim)
+ newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim)
if newb.is_numeric and self.is_numeric:
if newb.shape != self.shape:
@@ -615,9 +657,13 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
"cannot set astype for copy = [{copy}] for dtype "
"({dtype} [{shape}]) to different shape "
"({newb_dtype} [{newb_shape}])".format(
- copy=copy, dtype=self.dtype.name,
- shape=self.shape, newb_dtype=newb.dtype.name,
- newb_shape=newb.shape))
+ copy=copy,
+ dtype=self.dtype.name,
+ shape=self.shape,
+ newb_dtype=newb.dtype.name,
+ newb_shape=newb.shape,
+ )
+ )
return newb
def convert(self, copy=True, **kwargs):
@@ -647,7 +693,7 @@ def _try_cast_result(self, result, dtype=None):
pass
elif self.is_float and result.dtype == self.dtype:
# protect against a bool/object showing up here
- if isinstance(dtype, str) and dtype == 'infer':
+ if isinstance(dtype, str) and dtype == "infer":
return result
# This is only reached via Block.setitem, where dtype is always
@@ -678,9 +724,12 @@ def _try_coerce_args(self, other):
if np.any(notna(other)) and not self._can_hold_element(other):
# coercion issues
# let higher levels handle
- raise TypeError("cannot convert {} to an {}".format(
- type(other).__name__,
- type(self).__name__.lower().replace('Block', '')))
+ raise TypeError(
+ "cannot convert {} to an {}".format(
+ type(other).__name__,
+ type(self).__name__.lower().replace("Block", ""),
+ )
+ )
return other
@@ -693,8 +742,7 @@ def _try_coerce_and_cast_result(self, result, dtype=None):
result = self._try_cast_result(result, dtype=dtype)
return result
- def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
- **kwargs):
+ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.get_values()
@@ -706,7 +754,7 @@ def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
if not self.is_object and not quoting:
values = values.astype(str)
else:
- values = np.array(values, dtype='object')
+ values = np.array(values, dtype="object")
values[mask] = na_rep
return values
@@ -719,14 +767,15 @@ def copy(self, deep=True):
values = values.copy()
return self.make_block_same_class(values, ndim=self.ndim)
- def replace(self, to_replace, value, inplace=False, filter=None,
- regex=False, convert=True):
+ def replace(
+ self, to_replace, value, inplace=False, filter=None, regex=False, convert=True
+ ):
"""replace the to_replace value with value, possible to create new
blocks here this is just a call to putmask. regex is not used here.
It is used in ObjectBlocks. It is here for API compatibility.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
original_to_replace = to_replace
# try to replace, if we raise an error, convert to ObjectBlock and
@@ -742,12 +791,14 @@ def replace(self, to_replace, value, inplace=False, filter=None,
# try again with a compatible block
block = self.astype(object)
- return block.replace(to_replace=original_to_replace,
- value=value,
- inplace=inplace,
- filter=filter,
- regex=regex,
- convert=convert)
+ return block.replace(
+ to_replace=original_to_replace,
+ value=value,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert,
+ )
mask = missing.mask_missing(values, to_replace)
if filter is not None:
@@ -764,20 +815,23 @@ def replace(self, to_replace, value, inplace=False, filter=None,
# try again with a compatible block
block = self.astype(object)
- return block.replace(to_replace=original_to_replace,
- value=value,
- inplace=inplace,
- filter=filter,
- regex=regex,
- convert=convert)
+ return block.replace(
+ to_replace=original_to_replace,
+ value=value,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert,
+ )
if convert:
- blocks = [b.convert(by_item=True, numeric=False,
- copy=not inplace) for b in blocks]
+ blocks = [
+ b.convert(by_item=True, numeric=False, copy=not inplace) for b in blocks
+ ]
return blocks
def _replace_single(self, *args, **kwargs):
""" no-op on a non-ObjectBlock """
- return self if kwargs['inplace'] else self.copy()
+ return self if kwargs["inplace"] else self.copy()
def setitem(self, indexer, value):
"""Set the value inplace, returning a a maybe different typed block.
@@ -809,17 +863,16 @@ def setitem(self, indexer, value):
value = self._try_coerce_args(value)
values = self._coerce_values(values)
# can keep its own dtype
- if hasattr(value, 'dtype') and is_dtype_equal(values.dtype,
- value.dtype):
+ if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype):
dtype = self.dtype
else:
- dtype = 'infer'
+ dtype = "infer"
except (TypeError, ValueError):
# current dtype cannot store value, coerce to common dtype
find_dtype = False
- if hasattr(value, 'dtype'):
+ if hasattr(value, "dtype"):
dtype = value.dtype
find_dtype = True
@@ -828,11 +881,10 @@ def setitem(self, indexer, value):
# NaN promotion is handled in latter path
dtype = False
else:
- dtype, _ = infer_dtype_from_scalar(value,
- pandas_dtype=True)
+ dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)
find_dtype = True
else:
- dtype = 'infer'
+ dtype = "infer"
if find_dtype:
dtype = find_common_type([values.dtype, dtype])
@@ -860,8 +912,9 @@ def _is_scalar_indexer(indexer):
if arr_value.ndim == 1:
if not isinstance(indexer, tuple):
indexer = tuple([indexer])
- return any(isinstance(idx, np.ndarray) and len(idx) == 0
- for idx in indexer)
+ return any(
+ isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer
+ )
return False
def _is_empty_indexer(indexer):
@@ -872,8 +925,9 @@ def _is_empty_indexer(indexer):
if arr_value.ndim == 1:
if not isinstance(indexer, tuple):
indexer = tuple([indexer])
- return any(isinstance(idx, np.ndarray) and len(idx) == 0
- for idx in indexer)
+ return any(
+ isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer
+ )
return False
# empty indexers
@@ -889,9 +943,11 @@ def _is_empty_indexer(indexer):
# if we are an exact match (ex-broadcasting),
# then use the resultant dtype
- elif (len(arr_value.shape) and
- arr_value.shape[0] == values.shape[0] and
- np.prod(arr_value.shape) == np.prod(values.shape)):
+ elif (
+ len(arr_value.shape)
+ and arr_value.shape[0] == values.shape[0]
+ and np.prod(arr_value.shape) == np.prod(values.shape)
+ ):
values[indexer] = value
try:
values = values.astype(arr_value.dtype)
@@ -907,8 +963,7 @@ def _is_empty_indexer(indexer):
block = self.make_block(transf(values))
return block
- def putmask(self, mask, new, align=True, inplace=False, axis=0,
- transpose=False):
+ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False):
""" putmask the data to the block; it is possible that we may create a
new dtype of block
@@ -931,8 +986,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
new_values = self.values if inplace else self.values.copy()
- new = getattr(new, 'values', new)
- mask = getattr(mask, 'values', mask)
+ new = getattr(new, "values", new)
+ mask = getattr(mask, "values", mask)
# if we are passed a scalar None, convert it here
if not is_list_like(new) and isna(new) and not self.is_object:
@@ -946,10 +1001,9 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
# If the default repeat behavior in np.putmask would go in the
# wrong direction, then explicitly repeat and reshape new instead
- if getattr(new, 'ndim', 0) >= 1:
+ if getattr(new, "ndim", 0) >= 1:
if self.ndim - 1 == new.ndim and axis == 1:
- new = np.repeat(
- new, new_values.shape[-1]).reshape(self.shape)
+ new = np.repeat(new, new_values.shape[-1]).reshape(self.shape)
new = new.astype(new_values.dtype)
# we require exact matches between the len of the
@@ -959,15 +1013,18 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
#
# TODO: this prob needs some better checking
# for 2D cases
- if ((is_list_like(new) and
- np.any(mask[mask]) and
- getattr(new, 'ndim', 1) == 1)):
-
- if not (mask.shape[-1] == len(new) or
- mask[mask].shape[-1] == len(new) or
- len(new) == 1):
- raise ValueError("cannot assign mismatch "
- "length to masked array")
+ if (
+ is_list_like(new)
+ and np.any(mask[mask])
+ and getattr(new, "ndim", 1) == 1
+ ):
+
+ if not (
+ mask.shape[-1] == len(new)
+ or mask[mask].shape[-1] == len(new)
+ or len(new) == 1
+ ):
+ raise ValueError("cannot assign mismatch " "length to masked array")
np.putmask(new_values, mask, new)
@@ -980,7 +1037,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
axis = new_values.ndim - axis - 1
# Pseudo-broadcast
- if getattr(new, 'ndim', 0) >= 1:
+ if getattr(new, "ndim", 0) >= 1:
if self.ndim - 1 == new.ndim:
new_shape = list(new.shape)
new_shape.insert(axis, 1)
@@ -1038,40 +1095,47 @@ def coerce_to_target_dtype(self, other):
# we don't upcast to bool
return self.astype(object)
- elif ((self.is_float or self.is_complex) and
- (is_integer_dtype(dtype) or is_float_dtype(dtype))):
+ elif (self.is_float or self.is_complex) and (
+ is_integer_dtype(dtype) or is_float_dtype(dtype)
+ ):
# don't coerce float/complex to int
return self
- elif (self.is_datetime or
- is_datetime64_dtype(dtype) or
- is_datetime64tz_dtype(dtype)):
+ elif (
+ self.is_datetime
+ or is_datetime64_dtype(dtype)
+ or is_datetime64tz_dtype(dtype)
+ ):
# not a datetime
- if not ((is_datetime64_dtype(dtype) or
- is_datetime64tz_dtype(dtype)) and self.is_datetime):
+ if not (
+ (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype))
+ and self.is_datetime
+ ):
return self.astype(object)
# don't upcast timezone with different timezone or no timezone
- mytz = getattr(self.dtype, 'tz', None)
- othertz = getattr(dtype, 'tz', None)
+ mytz = getattr(self.dtype, "tz", None)
+ othertz = getattr(dtype, "tz", None)
if str(mytz) != str(othertz):
return self.astype(object)
- raise AssertionError("possible recursion in "
- "coerce_to_target_dtype: {} {}".format(
- self, other))
+ raise AssertionError(
+ "possible recursion in "
+ "coerce_to_target_dtype: {} {}".format(self, other)
+ )
- elif (self.is_timedelta or is_timedelta64_dtype(dtype)):
+ elif self.is_timedelta or is_timedelta64_dtype(dtype):
# not a timedelta
if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
return self.astype(object)
- raise AssertionError("possible recursion in "
- "coerce_to_target_dtype: {} {}".format(
- self, other))
+ raise AssertionError(
+ "possible recursion in "
+ "coerce_to_target_dtype: {} {}".format(self, other)
+ )
try:
return self.astype(dtype)
@@ -1080,12 +1144,23 @@ def coerce_to_target_dtype(self, other):
return self.astype(object)
- def interpolate(self, method='pad', axis=0, index=None, values=None,
- inplace=False, limit=None, limit_direction='forward',
- limit_area=None, fill_value=None, coerce=False,
- downcast=None, **kwargs):
-
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ def interpolate(
+ self,
+ method="pad",
+ axis=0,
+ index=None,
+ values=None,
+ inplace=False,
+ limit=None,
+ limit_direction="forward",
+ limit_area=None,
+ fill_value=None,
+ coerce=False,
+ downcast=None,
+ **kwargs
+ ):
+
+ inplace = validate_bool_kwarg(inplace, "inplace")
def check_int_bool(self, inplace):
# Only FloatBlocks will contain NaNs.
@@ -1106,30 +1181,48 @@ def check_int_bool(self, inplace):
r = check_int_bool(self, inplace)
if r is not None:
return r
- return self._interpolate_with_fill(method=m, axis=axis,
- inplace=inplace, limit=limit,
- fill_value=fill_value,
- coerce=coerce,
- downcast=downcast)
+ return self._interpolate_with_fill(
+ method=m,
+ axis=axis,
+ inplace=inplace,
+ limit=limit,
+ fill_value=fill_value,
+ coerce=coerce,
+ downcast=downcast,
+ )
# validate the interp method
m = missing.clean_interp_method(method, **kwargs)
r = check_int_bool(self, inplace)
if r is not None:
return r
- return self._interpolate(method=m, index=index, values=values,
- axis=axis, limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- fill_value=fill_value, inplace=inplace,
- downcast=downcast, **kwargs)
-
- def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
- limit=None, fill_value=None, coerce=False,
- downcast=None):
+ return self._interpolate(
+ method=m,
+ index=index,
+ values=values,
+ axis=axis,
+ limit=limit,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ fill_value=fill_value,
+ inplace=inplace,
+ downcast=downcast,
+ **kwargs
+ )
+
+ def _interpolate_with_fill(
+ self,
+ method="pad",
+ axis=0,
+ inplace=False,
+ limit=None,
+ fill_value=None,
+ coerce=False,
+ downcast=None,
+ ):
""" fillna but using the interpolate machinery """
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# if we are coercing, then don't force the conversion
# if the block can't hold the type
@@ -1143,21 +1236,36 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
values = self.values if inplace else self.values.copy()
values = self._coerce_values(values)
fill_value = self._try_coerce_args(fill_value)
- values = missing.interpolate_2d(values, method=method, axis=axis,
- limit=limit, fill_value=fill_value,
- dtype=self.dtype)
+ values = missing.interpolate_2d(
+ values,
+ method=method,
+ axis=axis,
+ limit=limit,
+ fill_value=fill_value,
+ dtype=self.dtype,
+ )
values = self._try_coerce_result(values)
blocks = [self.make_block_same_class(values, ndim=self.ndim)]
return self._maybe_downcast(blocks, downcast)
- def _interpolate(self, method=None, index=None, values=None,
- fill_value=None, axis=0, limit=None,
- limit_direction='forward', limit_area=None,
- inplace=False, downcast=None, **kwargs):
+ def _interpolate(
+ self,
+ method=None,
+ index=None,
+ values=None,
+ fill_value=None,
+ axis=0,
+ limit=None,
+ limit_direction="forward",
+ limit_area=None,
+ inplace=False,
+ downcast=None,
+ **kwargs
+ ):
""" interpolate using scipy wrappers """
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
data = self.values if inplace else self.values.copy()
# only deal with floats
@@ -1169,10 +1277,12 @@ def _interpolate(self, method=None, index=None, values=None,
if fill_value is None:
fill_value = self.fill_value
- if method in ('krogh', 'piecewise_polynomial', 'pchip'):
+ if method in ("krogh", "piecewise_polynomial", "pchip"):
if not index.is_monotonic:
- raise ValueError("{0} interpolation requires that the "
- "index be monotonic.".format(method))
+ raise ValueError(
+ "{0} interpolation requires that the "
+ "index be monotonic.".format(method)
+ )
# process 1-d slices in the axis direction
def func(x):
@@ -1180,11 +1290,17 @@ def func(x):
# process a 1-d slice, returning it
# should the axis argument be handled below in apply_along_axis?
# i.e. not an arg to missing.interpolate_1d
- return missing.interpolate_1d(index, x, method=method, limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- fill_value=fill_value,
- bounds_error=False, **kwargs)
+ return missing.interpolate_1d(
+ index,
+ x,
+ method=method,
+ limit=limit,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ fill_value=fill_value,
+ bounds_error=False,
+ **kwargs
+ )
# interp each column independently
interp_values = np.apply_along_axis(func, axis, data)
@@ -1206,12 +1322,14 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
if fill_tuple is None:
fill_value = self.fill_value
- new_values = algos.take_nd(values, indexer, axis=axis,
- allow_fill=False, fill_value=fill_value)
+ new_values = algos.take_nd(
+ values, indexer, axis=axis, allow_fill=False, fill_value=fill_value
+ )
else:
fill_value = fill_tuple[0]
- new_values = algos.take_nd(values, indexer, axis=axis,
- allow_fill=True, fill_value=fill_value)
+ new_values = algos.take_nd(
+ values, indexer, axis=axis, allow_fill=True, fill_value=fill_value
+ )
if new_mgr_locs is None:
if axis == 0:
@@ -1247,8 +1365,7 @@ def shift(self, periods, axis=0, fill_value=None):
axis = new_values.ndim - axis - 1
if np.prod(new_values.shape):
- new_values = np.roll(new_values, ensure_platform_int(periods),
- axis=axis)
+ new_values = np.roll(new_values, ensure_platform_int(periods), axis=axis)
axis_indexer = [slice(None)] * self.ndim
if periods > 0:
@@ -1263,8 +1380,16 @@ def shift(self, periods, axis=0, fill_value=None):
return [self.make_block(new_values)]
- def where(self, other, cond, align=True, errors='raise',
- try_cast=False, axis=0, transpose=False):
+ def where(
+ self,
+ other,
+ cond,
+ align=True,
+ errors="raise",
+ try_cast=False,
+ axis=0,
+ transpose=False,
+ ):
"""
evaluate the block; return result block(s) from the result
@@ -1286,27 +1411,27 @@ def where(self, other, cond, align=True, errors='raise',
a new block(s), the result of the func
"""
import pandas.core.computation.expressions as expressions
- assert errors in ['raise', 'ignore']
+
+ assert errors in ["raise", "ignore"]
values = self.values
orig_other = other
if transpose:
values = values.T
- other = getattr(other, '_values', getattr(other, 'values', other))
- cond = getattr(cond, 'values', cond)
+ other = getattr(other, "_values", getattr(other, "values", other))
+ cond = getattr(cond, "values", cond)
# If the default broadcasting would go in the wrong direction, then
# explicitly reshape other instead
- if getattr(other, 'ndim', 0) >= 1:
+ if getattr(other, "ndim", 0) >= 1:
if values.ndim - 1 == other.ndim and axis == 1:
- other = other.reshape(tuple(other.shape + (1, )))
+ other = other.reshape(tuple(other.shape + (1,)))
elif transpose and values.ndim == self.ndim - 1:
cond = cond.T
- if not hasattr(cond, 'shape'):
- raise ValueError("where must have a condition that is ndarray "
- "like")
+ if not hasattr(cond, "shape"):
+ raise ValueError("where must have a condition that is ndarray " "like")
# our where function
def func(cond, values, other):
@@ -1316,13 +1441,14 @@ def func(cond, values, other):
fastres = expressions.where(cond, values, other)
return self._try_coerce_result(fastres)
except Exception as detail:
- if errors == 'raise':
+ if errors == "raise":
raise TypeError(
- 'Could not operate [{other!r}] with block values '
- '[{detail!s}]'.format(other=other, detail=detail))
+ "Could not operate [{other!r}] with block values "
+ "[{detail!s}]".format(other=other, detail=detail)
+ )
else:
# return the values
- result = np.empty(values.shape, dtype='float64')
+ result = np.empty(values.shape, dtype="float64")
result.fill(np.nan)
return result
@@ -1339,11 +1465,16 @@ def func(cond, values, other):
# we cannot coerce, return a compat dtype
# we are explicitly ignoring errors
block = self.coerce_to_target_dtype(other)
- blocks = block.where(orig_other, cond, align=align,
- errors=errors,
- try_cast=try_cast, axis=axis,
- transpose=transpose)
- return self._maybe_downcast(blocks, 'infer')
+ blocks = block.where(
+ orig_other,
+ cond,
+ align=align,
+ errors=errors,
+ try_cast=try_cast,
+ axis=axis,
+ transpose=transpose,
+ )
+ return self._maybe_downcast(blocks, "infer")
if self._can_hold_na or self.ndim == 1:
@@ -1359,8 +1490,7 @@ def func(cond, values, other):
# might need to separate out blocks
axis = cond.ndim - 1
cond = cond.swapaxes(axis, 0)
- mask = np.array([cond[i].all() for i in range(cond.shape[0])],
- dtype=bool)
+ mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool)
result_blocks = []
for m in [mask, ~mask]:
@@ -1410,7 +1540,7 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
blocks = [make_block(new_values, placement=new_placement)]
return blocks, mask
- def quantile(self, qs, interpolation='linear', axis=0):
+ def quantile(self, qs, interpolation="linear", axis=0):
"""
compute the quantiles of the block
@@ -1450,18 +1580,23 @@ def quantile(self, qs, interpolation='linear', axis=0):
if is_empty:
# create the array of na_values
# 2d len(values) * len(qs)
- result = np.repeat(np.array([self.fill_value] * len(qs)),
- len(values)).reshape(len(values),
- len(qs))
+ result = np.repeat(
+ np.array([self.fill_value] * len(qs)), len(values)
+ ).reshape(len(values), len(qs))
else:
# asarray needed for Sparse, see GH#24600
# Note: we use self.values below instead of values because the
# `asi8` conversion above will behave differently under `isna`
mask = np.asarray(isna(self.values))
- result = nanpercentile(values, np.array(qs) * 100,
- axis=axis, na_value=self.fill_value,
- mask=mask, ndim=self.ndim,
- interpolation=interpolation)
+ result = nanpercentile(
+ values,
+ np.array(qs) * 100,
+ axis=axis,
+ na_value=self.fill_value,
+ mask=mask,
+ ndim=self.ndim,
+ interpolation=interpolation,
+ )
result = np.array(result, copy=False)
result = result.T
@@ -1472,14 +1607,13 @@ def quantile(self, qs, interpolation='linear', axis=0):
result = result[..., 0]
result = lib.item_from_zerodim(result)
- ndim = getattr(result, 'ndim', None) or 0
+ ndim = getattr(result, "ndim", None) or 0
result = self._try_coerce_result(result)
- return make_block(result,
- placement=np.arange(len(result)),
- ndim=ndim)
+ return make_block(result, placement=np.arange(len(result)), ndim=ndim)
- def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
- convert=False, mask=None):
+ def _replace_coerce(
+ self, to_replace, value, inplace=True, regex=False, convert=False, mask=None
+ ):
"""
Replace value corresponding to the given boolean array with another
value.
@@ -1509,15 +1643,20 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
self = self.coerce_to_target_dtype(value)
return self.putmask(mask, value, inplace=inplace)
else:
- return self._replace_single(to_replace, value, inplace=inplace,
- regex=regex,
- convert=convert,
- mask=mask)
+ return self._replace_single(
+ to_replace,
+ value,
+ inplace=inplace,
+ regex=regex,
+ convert=convert,
+ mask=mask,
+ )
return self
class NonConsolidatableMixIn:
""" hold methods for the nonconsolidatable blocks """
+
_can_consolidate = False
_verify_integrity = False
_validate_ndim = False
@@ -1546,7 +1685,7 @@ def __init__(self, values, placement, ndim=None):
@property
def shape(self):
if self.ndim == 1:
- return (len(self.values)),
+ return (len(self.values),)
return (len(self.mgr_locs), len(self.values))
def iget(self, col):
@@ -1572,8 +1711,7 @@ def set(self, locs, values, check=False):
assert locs.tolist() == [0]
self.values = values
- def putmask(self, mask, new, align=True, inplace=False, axis=0,
- transpose=False):
+ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False):
"""
putmask the data to the block; we must be a single block and not
generate other blocks
@@ -1591,7 +1729,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
-------
a new block, the result of the putmask
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# use block's copy logic.
# .values may be an Index which does shallow copy by default
@@ -1654,6 +1792,7 @@ class ExtensionBlock(NonConsolidatableMixIn, Block):
ExtensionArrays are limited to 1-D.
"""
+
is_extension = True
def __init__(self, values, placement, ndim=None):
@@ -1754,8 +1893,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
# axis doesn't matter; we are really a single-dim object
# but are passed the axis depending on the calling routine
# if it's REALLY axis 0, then this will be a reindex and not a take
- new_values = self.values.take(indexer, fill_value=fill_value,
- allow_fill=True)
+ new_values = self.values.take(indexer, fill_value=fill_value, allow_fill=True)
if self.ndim == 1 and new_mgr_locs is None:
new_mgr_locs = [0]
@@ -1778,8 +1916,7 @@ def _slice(self, slicer):
if isinstance(slicer, tuple) and len(slicer) == 2:
if not com.is_null_slice(slicer[0]):
- raise AssertionError("invalid slicing for a 1-ndim "
- "categorical")
+ raise AssertionError("invalid slicing for a 1-ndim " "categorical")
slicer = slicer[1]
return self.values[slicer]
@@ -1798,8 +1935,7 @@ def _try_cast_result(self, result, dtype=None):
"""
try:
- result = self._holder._from_sequence(
- result.ravel(), dtype=dtype)
+ result = self._holder._from_sequence(result.ravel(), dtype=dtype)
except Exception:
pass
@@ -1809,7 +1945,7 @@ def formatting_values(self):
# Deprecating the ability to override _formatting_values.
# Do the warning here, it's only used in pandas, since we
# have to check if the subclass overrode it.
- fv = getattr(type(self.values), '_formatting_values', None)
+ fv = getattr(type(self.values), "_formatting_values", None)
if fv and fv != ExtensionArray._formatting_values:
msg = (
"'ExtensionArray._formatting_values' is deprecated. "
@@ -1824,32 +1960,35 @@ def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
"""
- values = self._holder._concat_same_type(
- [blk.values for blk in to_concat])
+ values = self._holder._concat_same_type([blk.values for blk in to_concat])
placement = placement or slice(0, len(values), 1)
- return self.make_block_same_class(values, ndim=self.ndim,
- placement=placement)
+ return self.make_block_same_class(values, ndim=self.ndim, placement=placement)
def fillna(self, value, limit=None, inplace=False, downcast=None):
values = self.values if inplace else self.values.copy()
values = values.fillna(value=value, limit=limit)
- return [self.make_block_same_class(values=values,
- placement=self.mgr_locs,
- ndim=self.ndim)]
+ return [
+ self.make_block_same_class(
+ values=values, placement=self.mgr_locs, ndim=self.ndim
+ )
+ ]
- def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
- fill_value=None, **kwargs):
+ def interpolate(
+ self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs
+ ):
values = self.values if inplace else self.values.copy()
return self.make_block_same_class(
- values=values.fillna(value=fill_value, method=method,
- limit=limit),
- placement=self.mgr_locs)
+ values=values.fillna(value=fill_value, method=method, limit=limit),
+ placement=self.mgr_locs,
+ )
- def shift(self,
- periods: int,
- axis: libinternals.BlockPlacement = 0,
- fill_value: Any = None) -> List['ExtensionBlock']:
+ def shift(
+ self,
+ periods: int,
+ axis: libinternals.BlockPlacement = 0,
+ fill_value: Any = None,
+ ) -> List["ExtensionBlock"]:
"""
Shift the block by `periods`.
@@ -1859,11 +1998,21 @@ def shift(self,
return [
self.make_block_same_class(
self.values.shift(periods=periods, fill_value=fill_value),
- placement=self.mgr_locs, ndim=self.ndim)
+ placement=self.mgr_locs,
+ ndim=self.ndim,
+ )
]
- def where(self, other, cond, align=True, errors='raise',
- try_cast=False, axis=0, transpose=False):
+ def where(
+ self,
+ other,
+ cond,
+ align=True,
+ errors="raise",
+ try_cast=False,
+ axis=0,
+ transpose=False,
+ ):
if isinstance(other, ABCDataFrame):
# ExtensionArrays are 1-D, so if we get here then
# `other` should be a DataFrame with a single column.
@@ -1904,15 +2053,14 @@ def where(self, other, cond, align=True, errors='raise',
# TypeError for SparseArray, which implements just to raise
# a TypeError
result = self._holder._from_sequence(
- np.where(cond, self.values, other),
- dtype=dtype,
+ np.where(cond, self.values, other), dtype=dtype
)
return self.make_block_same_class(result, placement=self.mgr_locs)
@property
def _ftype(self):
- return getattr(self.values, '_pandas_ftype', Block._ftype)
+ return getattr(self.values, "_pandas_ftype", Block._ftype)
def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
# ExtensionArray-safe unstack.
@@ -1931,9 +2079,9 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value):
blocks = [
self.make_block_same_class(
- self.values.take(indices, allow_fill=True,
- fill_value=fill_value),
- [place])
+ self.values.take(indices, allow_fill=True, fill_value=fill_value),
+ [place],
+ )
for indices, place in zip(new_values.T, new_placement)
]
return blocks, mask
@@ -1974,16 +2122,25 @@ class FloatBlock(FloatOrComplexBlock):
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
- return (issubclass(tipo.type, (np.floating, np.integer)) and
- not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
- return (
- isinstance(
- element, (float, int, np.floating, np.int_)) and
- not isinstance(element, (bool, np.bool_, datetime, timedelta,
- np.datetime64, np.timedelta64)))
+ return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass(
+ tipo.type, (np.datetime64, np.timedelta64)
+ )
+ return isinstance(
+ element, (float, int, np.floating, np.int_)
+ ) and not isinstance(
+ element,
+ (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64),
+ )
- def to_native_types(self, slicer=None, na_rep='', float_format=None,
- decimal='.', quoting=None, **kwargs):
+ def to_native_types(
+ self,
+ slicer=None,
+ na_rep="",
+ float_format=None,
+ decimal=".",
+ quoting=None,
+ **kwargs
+ ):
""" convert to our native types format, slicing if desired """
values = self.values
@@ -1993,29 +2150,33 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None,
# see gh-13418: no special formatting is desired at the
# output (important for appropriate 'quoting' behaviour),
# so do not pass it through the FloatArrayFormatter
- if float_format is None and decimal == '.':
+ if float_format is None and decimal == ".":
mask = isna(values)
if not quoting:
values = values.astype(str)
else:
- values = np.array(values, dtype='object')
+ values = np.array(values, dtype="object")
values[mask] = na_rep
return values
from pandas.io.formats.format import FloatArrayFormatter
- formatter = FloatArrayFormatter(values, na_rep=na_rep,
- float_format=float_format,
- decimal=decimal, quoting=quoting,
- fixed_width=False)
+
+ formatter = FloatArrayFormatter(
+ values,
+ na_rep=na_rep,
+ float_format=float_format,
+ decimal=decimal,
+ quoting=quoting,
+ fixed_width=False,
+ )
return formatter.get_result_as_array()
def should_store(self, value):
# when inserting a column should not coerce integers to floats
# unnecessarily
- return (issubclass(value.dtype.type, np.floating) and
- value.dtype == self.dtype)
+ return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype
class ComplexBlock(FloatOrComplexBlock):
@@ -2025,13 +2186,10 @@ class ComplexBlock(FloatOrComplexBlock):
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
- return issubclass(tipo.type,
- (np.floating, np.integer, np.complexfloating))
- return (
- isinstance(
- element,
- (float, int, complex, np.float_, np.int_)) and
- not isinstance(element, (bool, np.bool_)))
+ return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating))
+ return isinstance(
+ element, (float, int, complex, np.float_, np.int_)
+ ) and not isinstance(element, (bool, np.bool_))
def should_store(self, value):
return issubclass(value.dtype.type, np.complexfloating)
@@ -2045,10 +2203,11 @@ class IntBlock(NumericBlock):
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
- return (issubclass(tipo.type, np.integer) and
- not issubclass(tipo.type, (np.datetime64,
- np.timedelta64)) and
- self.dtype.itemsize >= tipo.itemsize)
+ return (
+ issubclass(tipo.type, np.integer)
+ and not issubclass(tipo.type, (np.datetime64, np.timedelta64))
+ and self.dtype.itemsize >= tipo.itemsize
+ )
return is_integer(element)
def should_store(self, value):
@@ -2123,8 +2282,8 @@ def _astype(self, dtype, **kwargs):
# if we are passed a datetime64[ns, tz]
if is_datetime64tz_dtype(dtype):
values = self.values
- if getattr(values, 'tz', None) is None:
- values = DatetimeArray(values).tz_localize('UTC')
+ if getattr(values, "tz", None) is None:
+ values = DatetimeArray(values).tz_localize("UTC")
values = values.tz_convert(dtype.tz)
return self.make_block(values)
@@ -2135,11 +2294,10 @@ def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return tipo == _NS_DTYPE or tipo == np.int64
- return (is_integer(element) or isinstance(element, datetime) or
- isna(element))
+ return is_integer(element) or isinstance(element, datetime) or isna(element)
def _coerce_values(self, values):
- return values.view('i8')
+ return values.view("i8")
def _try_coerce_args(self, other):
"""
@@ -2163,12 +2321,13 @@ def _try_coerce_args(self, other):
other = tslibs.iNaT
elif isinstance(other, (datetime, np.datetime64, date)):
other = self._box_func(other)
- if getattr(other, 'tz') is not None:
- raise TypeError("cannot coerce a Timestamp with a tz on a "
- "naive Block")
- other = other.asm8.view('i8')
- elif hasattr(other, 'dtype') and is_datetime64_dtype(other):
- other = other.astype('i8', copy=False).view('i8')
+ if getattr(other, "tz") is not None:
+ raise TypeError(
+ "cannot coerce a Timestamp with a tz on a " "naive Block"
+ )
+ other = other.asm8.view("i8")
+ elif hasattr(other, "dtype") and is_datetime64_dtype(other):
+ other = other.astype("i8", copy=False).view("i8")
else:
# coercion issues
# let higher levels handle
@@ -2179,8 +2338,8 @@ def _try_coerce_args(self, other):
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
- if result.dtype.kind in ['i', 'f']:
- result = result.astype('M8[ns]')
+ if result.dtype.kind in ["i", "f"]:
+ result = result.astype("M8[ns]")
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = self._box_func(result)
@@ -2190,29 +2349,36 @@ def _try_coerce_result(self, result):
def _box_func(self):
return tslibs.Timestamp
- def to_native_types(self, slicer=None, na_rep=None, date_format=None,
- quoting=None, **kwargs):
+ def to_native_types(
+ self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs
+ ):
""" convert to our native types format, slicing if desired """
values = self.values
- i8values = self.values.view('i8')
+ i8values = self.values.view("i8")
if slicer is not None:
values = values[..., slicer]
i8values = i8values[..., slicer]
from pandas.io.formats.format import _get_format_datetime64_from_values
+
fmt = _get_format_datetime64_from_values(values, date_format)
result = tslib.format_array_from_datetime(
- i8values.ravel(), tz=getattr(self.values, 'tz', None),
- format=fmt, na_rep=na_rep).reshape(i8values.shape)
+ i8values.ravel(),
+ tz=getattr(self.values, "tz", None),
+ format=fmt,
+ na_rep=na_rep,
+ ).reshape(i8values.shape)
return np.atleast_2d(result)
def should_store(self, value):
- return (issubclass(value.dtype.type, np.datetime64) and
- not is_datetime64tz_dtype(value) and
- not is_extension_array_dtype(value))
+ return (
+ issubclass(value.dtype.type, np.datetime64)
+ and not is_datetime64tz_dtype(value)
+ and not is_extension_array_dtype(value)
+ )
def set(self, locs, values):
"""
@@ -2227,11 +2393,12 @@ def set(self, locs, values):
self.values[locs] = values
def external_values(self):
- return np.asarray(self.values.astype('datetime64[ns]', copy=False))
+ return np.asarray(self.values.astype("datetime64[ns]", copy=False))
class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
+
__slots__ = ()
is_datetimetz = True
is_extension = True
@@ -2350,7 +2517,7 @@ def _try_coerce_args(self, other):
other = _block_shape(other.asi8, ndim=self.ndim)
elif isinstance(other, (np.datetime64, datetime, date)):
other = tslibs.Timestamp(other)
- tz = getattr(other, 'tz', None)
+ tz = getattr(other, "tz", None)
# test we can have an equal time zone
if tz is None or str(tz) != str(self.values.tz):
@@ -2364,8 +2531,8 @@ def _try_coerce_args(self, other):
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
- if result.dtype.kind in ['i', 'f']:
- result = result.astype('M8[ns]')
+ if result.dtype.kind in ["i", "f"]:
+ result = result.astype("M8[ns]")
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = self._box_func(result)
@@ -2376,8 +2543,9 @@ def _try_coerce_result(self, result):
if result.ndim > 1:
result = result.reshape(np.prod(result.shape))
# GH#24096 new values invalidates a frequency
- result = self._holder._simple_new(result, freq=None,
- dtype=self.values.dtype)
+ result = self._holder._simple_new(
+ result, freq=None, dtype=self.values.dtype
+ )
return result
@@ -2410,7 +2578,7 @@ def diff(self, n, axis=0):
# Reshape the new_values like how algos.diff does for timedelta data
new_values = new_values.reshape(1, len(new_values))
- new_values = new_values.astype('timedelta64[ns]')
+ new_values = new_values.astype("timedelta64[ns]")
return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
def concat_same_type(self, to_concat, placement=None):
@@ -2445,16 +2613,16 @@ def setitem(self, indexer, value):
try:
return super().setitem(indexer, value)
except (ValueError, TypeError):
- newb = make_block(self.values.astype(object),
- placement=self.mgr_locs,
- klass=ObjectBlock)
+ newb = make_block(
+ self.values.astype(object), placement=self.mgr_locs, klass=ObjectBlock
+ )
return newb.setitem(indexer, value)
def equals(self, other):
# override for significant performance improvement
if self.dtype != other.dtype or self.shape != other.shape:
return False
- return (self.values.view('i8') == other.values.view('i8')).all()
+ return (self.values.view("i8") == other.values.view("i8")).all()
class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
@@ -2477,14 +2645,15 @@ def _holder(self):
@property
def _box_func(self):
- return lambda x: Timedelta(x, unit='ns')
+ return lambda x: Timedelta(x, unit="ns")
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, (np.timedelta64, np.int64))
return is_integer(element) or isinstance(
- element, (timedelta, np.timedelta64, np.int64))
+ element, (timedelta, np.timedelta64, np.int64)
+ )
def fillna(self, value, **kwargs):
@@ -2492,16 +2661,19 @@ def fillna(self, value, **kwargs):
# interpreted as nanoseconds
if is_integer(value) and not isinstance(value, np.timedelta64):
# Deprecation GH#24694, GH#19233
- warnings.warn("Passing integers to fillna is deprecated, will "
- "raise a TypeError in a future version. To retain "
- "the old behavior, pass pd.Timedelta(seconds=n) "
- "instead.",
- FutureWarning, stacklevel=6)
- value = Timedelta(value, unit='s')
+ warnings.warn(
+ "Passing integers to fillna is deprecated, will "
+ "raise a TypeError in a future version. To retain "
+ "the old behavior, pass pd.Timedelta(seconds=n) "
+ "instead.",
+ FutureWarning,
+ stacklevel=6,
+ )
+ value = Timedelta(value, unit="s")
return super().fillna(value, **kwargs)
def _coerce_values(self, values):
- return values.view('i8')
+ return values.view("i8")
def _try_coerce_args(self, other):
"""
@@ -2523,8 +2695,8 @@ def _try_coerce_args(self, other):
other = tslibs.iNaT
elif isinstance(other, (timedelta, np.timedelta64)):
other = Timedelta(other).value
- elif hasattr(other, 'dtype') and is_timedelta64_dtype(other):
- other = other.astype('i8', copy=False).view('i8')
+ elif hasattr(other, "dtype") and is_timedelta64_dtype(other):
+ other = other.astype("i8", copy=False).view("i8")
else:
# coercion issues
# let higher levels handle
@@ -2536,8 +2708,8 @@ def _try_coerce_result(self, result):
""" reverse of try_coerce_args / try_operate """
if isinstance(result, np.ndarray):
mask = isna(result)
- if result.dtype.kind in ['i', 'f']:
- result = result.astype('m8[ns]')
+ if result.dtype.kind in ["i", "f"]:
+ result = result.astype("m8[ns]")
result[mask] = tslibs.iNaT
elif isinstance(result, (np.integer, np.float)):
@@ -2546,11 +2718,11 @@ def _try_coerce_result(self, result):
return result
def should_store(self, value):
- return (issubclass(value.dtype.type, np.timedelta64) and
- not is_extension_array_dtype(value))
+ return issubclass(
+ value.dtype.type, np.timedelta64
+ ) and not is_extension_array_dtype(value)
- def to_native_types(self, slicer=None, na_rep=None, quoting=None,
- **kwargs):
+ def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
@@ -2560,7 +2732,7 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None,
rvalues = np.empty(values.shape, dtype=object)
if na_rep is None:
- na_rep = 'NaT'
+ na_rep = "NaT"
rvalues[mask] = na_rep
imask = (~mask).ravel()
@@ -2568,9 +2740,10 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None,
# should use the formats.format.Timedelta64Formatter here
# to figure what format to pass to the Timedelta
# e.g. to not show the decimals say
- rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
- for val in values.ravel()[imask]],
- dtype=object)
+ rvalues.flat[imask] = np.array(
+ [Timedelta(val)._repr_base(format="all") for val in values.ravel()[imask]],
+ dtype=object,
+ )
return rvalues
def external_values(self, dtype=None):
@@ -2589,17 +2762,25 @@ def _can_hold_element(self, element):
return isinstance(element, (bool, np.bool_))
def should_store(self, value):
- return (issubclass(value.dtype.type, np.bool_) and not
- is_extension_array_dtype(value))
+ return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype(
+ value
+ )
- def replace(self, to_replace, value, inplace=False, filter=None,
- regex=False, convert=True):
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ def replace(
+ self, to_replace, value, inplace=False, filter=None, regex=False, convert=True
+ ):
+ inplace = validate_bool_kwarg(inplace, "inplace")
to_replace_values = np.atleast_1d(to_replace)
if not np.can_cast(to_replace_values, bool):
return self
- return super().replace(to_replace, value, inplace=inplace,
- filter=filter, regex=regex, convert=convert)
+ return super().replace(
+ to_replace,
+ value,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert,
+ )
class ObjectBlock(Block):
@@ -2630,9 +2811,9 @@ def convert(self, *args, **kwargs):
if args:
raise NotImplementedError
- by_item = kwargs.get('by_item', True)
+ by_item = kwargs.get("by_item", True)
- new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta']
+ new_inputs = ["coerce", "datetime", "numeric", "timedelta"]
new_style = False
for kw in new_inputs:
new_style |= kw in kwargs
@@ -2642,9 +2823,8 @@ def convert(self, *args, **kwargs):
fn_inputs = new_inputs
else:
fn = maybe_convert_objects
- fn_inputs = ['convert_dates', 'convert_numeric',
- 'convert_timedeltas']
- fn_inputs += ['copy']
+ fn_inputs = ["convert_dates", "convert_numeric", "convert_timedeltas"]
+ fn_inputs += ["copy"]
fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs}
@@ -2663,8 +2843,7 @@ def f(m, v, i):
blocks = self.split_and_operate(None, f, False)
else:
values = f(None, self.values.ravel(), None)
- blocks = [make_block(values, ndim=self.ndim,
- placement=self.mgr_locs)]
+ blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)]
return blocks
@@ -2674,8 +2853,7 @@ def _maybe_downcast(self, blocks, downcast=None):
return blocks
# split and convert the blocks
- return _extend_blocks([b.convert(datetime=True, numeric=False)
- for b in blocks])
+ return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks])
def _can_hold_element(self, element):
return True
@@ -2696,16 +2874,21 @@ def _try_coerce_args(self, other):
return other
def should_store(self, value):
- return not (issubclass(value.dtype.type,
- (np.integer, np.floating, np.complexfloating,
- np.datetime64, np.bool_)) or
- # TODO(ExtensionArray): remove is_extension_type
- # when all extension arrays have been ported.
- is_extension_type(value) or
- is_extension_array_dtype(value))
-
- def replace(self, to_replace, value, inplace=False, filter=None,
- regex=False, convert=True):
+ return not (
+ issubclass(
+ value.dtype.type,
+ (np.integer, np.floating, np.complexfloating, np.datetime64, np.bool_),
+ )
+ or
+ # TODO(ExtensionArray): remove is_extension_type
+ # when all extension arrays have been ported.
+ is_extension_type(value)
+ or is_extension_array_dtype(value)
+ )
+
+ def replace(
+ self, to_replace, value, inplace=False, filter=None, regex=False, convert=True
+ ):
to_rep_is_list = is_list_like(to_replace)
value_is_list = is_list_like(value)
both_lists = to_rep_is_list and value_is_list
@@ -2715,19 +2898,35 @@ def replace(self, to_replace, value, inplace=False, filter=None,
blocks = [self]
if not either_list and is_re(to_replace):
- return self._replace_single(to_replace, value, inplace=inplace,
- filter=filter, regex=True,
- convert=convert)
+ return self._replace_single(
+ to_replace,
+ value,
+ inplace=inplace,
+ filter=filter,
+ regex=True,
+ convert=convert,
+ )
elif not (either_list or regex):
- return super().replace(to_replace, value, inplace=inplace,
- filter=filter, regex=regex, convert=convert)
+ return super().replace(
+ to_replace,
+ value,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert,
+ )
elif both_lists:
for to_rep, v in zip(to_replace, value):
result_blocks = []
for b in blocks:
- result = b._replace_single(to_rep, v, inplace=inplace,
- filter=filter, regex=regex,
- convert=convert)
+ result = b._replace_single(
+ to_rep,
+ v,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert,
+ )
result_blocks = _extend_blocks(result, result_blocks)
blocks = result_blocks
return result_blocks
@@ -2736,19 +2935,37 @@ def replace(self, to_replace, value, inplace=False, filter=None,
for to_rep in to_replace:
result_blocks = []
for b in blocks:
- result = b._replace_single(to_rep, value, inplace=inplace,
- filter=filter, regex=regex,
- convert=convert)
+ result = b._replace_single(
+ to_rep,
+ value,
+ inplace=inplace,
+ filter=filter,
+ regex=regex,
+ convert=convert,
+ )
result_blocks = _extend_blocks(result, result_blocks)
blocks = result_blocks
return result_blocks
- return self._replace_single(to_replace, value, inplace=inplace,
- filter=filter, convert=convert,
- regex=regex)
+ return self._replace_single(
+ to_replace,
+ value,
+ inplace=inplace,
+ filter=filter,
+ convert=convert,
+ regex=regex,
+ )
- def _replace_single(self, to_replace, value, inplace=False, filter=None,
- regex=False, convert=True, mask=None):
+ def _replace_single(
+ self,
+ to_replace,
+ value,
+ inplace=False,
+ filter=None,
+ regex=False,
+ convert=True,
+ mask=None,
+ ):
"""
Replace elements by the given value.
@@ -2772,7 +2989,7 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None,
-------
a new block, the result after replacing
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# to_replace is regex compilable
to_rep_re = regex and is_re_compilable(to_replace)
@@ -2782,8 +2999,9 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None,
# only one will survive
if to_rep_re and regex_re:
- raise AssertionError('only one of to_replace and regex can be '
- 'regex compilable')
+ raise AssertionError(
+ "only one of to_replace and regex can be " "regex compilable"
+ )
# if regex was passed as something that can be a regex (rather than a
# boolean)
@@ -2805,8 +3023,9 @@ def _replace_single(self, to_replace, value, inplace=False, filter=None,
else:
# if the thing to replace is not a string or compiled regex call
# the superclass method -> to_replace is some kind of object
- return super().replace(to_replace, value, inplace=inplace,
- filter=filter, regex=regex)
+ return super().replace(
+ to_replace, value, inplace=inplace, filter=filter, regex=regex
+ )
new_values = self.values if inplace else self.values.copy()
@@ -2819,6 +3038,7 @@ def re_replacer(s):
return value if rx.search(s) is not None else s
except TypeError:
return s
+
else:
# value is guaranteed to be a string here, s can be either a string
# or null if it's null it gets returned
@@ -2846,8 +3066,9 @@ def re_replacer(s):
block = block.convert(by_item=True, numeric=False)
return block
- def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
- convert=False, mask=None):
+ def _replace_coerce(
+ self, to_replace, value, inplace=True, regex=False, convert=False, mask=None
+ ):
"""
Replace value corresponding to the given boolean array with another
value.
@@ -2873,11 +3094,17 @@ def _replace_coerce(self, to_replace, value, inplace=True, regex=False,
"""
if mask.any():
block = super()._replace_coerce(
- to_replace=to_replace, value=value, inplace=inplace,
- regex=regex, convert=convert, mask=mask)
+ to_replace=to_replace,
+ value=value,
+ inplace=inplace,
+ regex=regex,
+ convert=convert,
+ mask=mask,
+ )
if convert:
- block = [b.convert(by_item=True, numeric=False, copy=True)
- for b in block]
+ block = [
+ b.convert(by_item=True, numeric=False, copy=True) for b in block
+ ]
return block
return self
@@ -2893,9 +3120,7 @@ def __init__(self, values, placement, ndim=None):
from pandas.core.arrays.categorical import _maybe_to_categorical
# coerce to categorical if we can
- super().__init__(_maybe_to_categorical(values),
- placement=placement,
- ndim=ndim)
+ super().__init__(_maybe_to_categorical(values), placement=placement, ndim=ndim)
@property
def _holder(self):
@@ -2913,8 +3138,7 @@ def _try_coerce_result(self, result):
# GH12564: CategoricalBlock is 1-dim only
# while returned results could be any dim
- if ((not is_categorical_dtype(result)) and
- isinstance(result, np.ndarray)):
+ if (not is_categorical_dtype(result)) and isinstance(result, np.ndarray):
result = _block_shape(result, ndim=self.ndim)
return result
@@ -2925,7 +3149,7 @@ def to_dense(self):
# other types.
return self.values._internal_get_values()
- def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
+ def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
@@ -2933,7 +3157,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
# Categorical is always one dimension
values = values[slicer]
mask = isna(values)
- values = np.array(values, dtype='object')
+ values = np.array(values, dtype="object")
values[mask] = na_rep
# we are expected to return a 2-d ndarray
@@ -2952,15 +3176,24 @@ def concat_same_type(self, to_concat, placement=None):
1. Change Categorical._concat_same_type to use union_categoricals
2. Delete this method.
"""
- values = self._concatenator([blk.values for blk in to_concat],
- axis=self.ndim - 1)
+ values = self._concatenator(
+ [blk.values for blk in to_concat], axis=self.ndim - 1
+ )
# not using self.make_block_same_class as values can be object dtype
return make_block(
- values, placement=placement or slice(0, len(values), 1),
- ndim=self.ndim)
+ values, placement=placement or slice(0, len(values), 1), ndim=self.ndim
+ )
- def where(self, other, cond, align=True, errors='raise',
- try_cast=False, axis=0, transpose=False):
+ def where(
+ self,
+ other,
+ cond,
+ align=True,
+ errors="raise",
+ try_cast=False,
+ axis=0,
+ transpose=False,
+ ):
# TODO(CategoricalBlock.where):
# This can all be deleted in favor of ExtensionBlock.where once
# we enforce the deprecation.
@@ -2980,16 +3213,22 @@ def where(self, other, cond, align=True, errors='raise',
)
except (TypeError, ValueError):
warnings.warn(object_msg, FutureWarning, stacklevel=6)
- result = self.astype(object).where(other, cond, align=align,
- errors=errors,
- try_cast=try_cast,
- axis=axis, transpose=transpose)
+ result = self.astype(object).where(
+ other,
+ cond,
+ align=align,
+ errors=errors,
+ try_cast=try_cast,
+ axis=axis,
+ transpose=transpose,
+ )
return result
# -----------------------------------------------------------------
# Constructor Helpers
+
def get_block_type(values, dtype=None):
"""
Find the appropriate Block subclass to use for the given values and dtype.
@@ -3036,8 +3275,7 @@ def get_block_type(values, dtype=None):
return cls
-def make_block(values, placement, klass=None, ndim=None, dtype=None,
- fastpath=None):
+def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=None):
# Ensure that we don't allow PandasArray / PandasDtype in internals.
# For now, blocks should be backed by ndarrays when possible.
if isinstance(values, ABCPandasArray):
@@ -3050,8 +3288,10 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
if fastpath is not None:
# GH#19265 pyarrow is passing this
- warnings.warn("fastpath argument is deprecated, will be removed "
- "in a future release.", FutureWarning)
+ warnings.warn(
+ "fastpath argument is deprecated, will be removed " "in a future release.",
+ FutureWarning,
+ )
if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(values, dtype)
@@ -3066,9 +3306,11 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
# -----------------------------------------------------------------
+
def _extend_blocks(result, blocks=None):
""" return a new extended blocks, givin the result """
from pandas.core.internals import BlockManager
+
if blocks is None:
blocks = []
if isinstance(result, list):
@@ -3093,7 +3335,7 @@ def _block_shape(values, ndim=1, shape=None):
# TODO: https://github.com/pandas-dev/pandas/issues/23023
# block.shape is incorrect for "2D" ExtensionArrays
# We can't, and don't need to, reshape.
- values = values.reshape(tuple((1, ) + shape))
+ values = values.reshape(tuple((1,) + shape))
return values
@@ -3193,13 +3435,14 @@ def _putmask_smart(v, m, n):
# only compare integers/floats
# don't compare integers to datetimelikes
- if (not is_numeric_v_string_like(nn, nn_at) and
- (is_float_dtype(nn.dtype) or
- is_integer_dtype(nn.dtype) and
- is_float_dtype(nn_at.dtype) or
- is_integer_dtype(nn_at.dtype))):
-
- comp = (nn == nn_at)
+ if not is_numeric_v_string_like(nn, nn_at) and (
+ is_float_dtype(nn.dtype)
+ or is_integer_dtype(nn.dtype)
+ and is_float_dtype(nn_at.dtype)
+ or is_integer_dtype(nn_at.dtype)
+ ):
+
+ comp = nn == nn_at
if is_list_like(comp) and comp.all():
nv = v.copy()
nv[m] = nn_at
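
The _putmask_smart hunk above only re-wraps an existing mixed and/or condition; black does not insert parentheses, so the grouping is exactly what it was before the reformat. As a standalone sanity check (a sketch, not part of the patch), the snippet below uses stand-in booleans for the four dtype checks to show that the condition groups as A or (B and C) or D, not as the pairwise (A or B) and (C or D) a quick read might suggest.

# Stand-ins for the four dtype checks in _putmask_smart; the names are
# illustrative only and do not come from pandas.
def as_written(a, b, c, d):
    # `and` binds tighter than `or`, so this is: a or (b and c) or d
    return a or b and c or d


def pairwise_reading(a, b, c, d):
    # the grouping a skim-reader might assume
    return (a or b) and (c or d)


# One assignment where the two readings disagree:
case = dict(a=True, b=False, c=False, d=False)
assert as_written(**case) is True
assert pairwise_reading(**case) is False
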
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index 6900dfc3c76d8..9ccd4b80869a0 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -9,9 +9,16 @@
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
- _get_dtype, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_extension_array_dtype, is_float_dtype,
- is_numeric_dtype, is_sparse, is_timedelta64_dtype)
+ _get_dtype,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_extension_array_dtype,
+ is_float_dtype,
+ is_numeric_dtype,
+ is_sparse,
+ is_timedelta64_dtype,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.missing import isna
@@ -54,9 +61,9 @@ def get_mgr_concatenation_plan(mgr, indexers):
blklocs = mgr._blklocs
plan = []
- for blkno, placements in libinternals.get_blkno_placements(blknos,
- mgr.nblocks,
- group=False):
+ for blkno, placements in libinternals.get_blkno_placements(
+ blknos, mgr.nblocks, group=False
+ ):
assert placements.is_slice_like
@@ -72,18 +79,26 @@ def get_mgr_concatenation_plan(mgr, indexers):
blk = mgr.blocks[blkno]
ax0_blk_indexer = blklocs[placements.indexer]
- unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and
- # Fastpath detection of join unit not
- # needing to reindex its block: no ax0
- # reindexing took place and block
- # placement was sequential before.
- ((ax0_indexer is None and
- blk.mgr_locs.is_slice_like and
- blk.mgr_locs.as_slice.step == 1) or
- # Slow-ish detection: all indexer locs
- # are sequential (and length match is
- # checked above).
- (np.diff(ax0_blk_indexer) == 1).all()))
+ unit_no_ax0_reindexing = (
+ len(placements) == len(blk.mgr_locs)
+ and
+ # Fastpath detection of join unit not
+ # needing to reindex its block: no ax0
+ # reindexing took place and block
+ # placement was sequential before.
+ (
+ (
+ ax0_indexer is None
+ and blk.mgr_locs.is_slice_like
+ and blk.mgr_locs.as_slice.step == 1
+ )
+ or
+ # Slow-ish detection: all indexer locs
+ # are sequential (and length match is
+ # checked above).
+ (np.diff(ax0_blk_indexer) == 1).all()
+ )
+ )
# Omit indexer if no item reindexing is required.
if unit_no_ax0_reindexing:
@@ -99,7 +114,6 @@ def get_mgr_concatenation_plan(mgr, indexers):
class JoinUnit:
-
def __init__(self, block, shape, indexers=None):
# Passing shape explicitly is required for cases when block is None.
if indexers is None:
@@ -109,9 +123,9 @@ def __init__(self, block, shape, indexers=None):
self.shape = shape
def __repr__(self):
- return '{name}({block!r}, {indexers})'.format(
- name=self.__class__.__name__, block=self.block,
- indexers=self.indexers)
+ return "{name}({block!r}, {indexers})".format(
+ name=self.__class__.__name__, block=self.block, indexers=self.indexers
+ )
@cache_readonly
def needs_filling(self):
@@ -130,8 +144,7 @@ def dtype(self):
if not self.needs_filling:
return self.block.dtype
else:
- return _get_dtype(maybe_promote(self.block.dtype,
- self.block.fill_value)[0])
+ return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0])
@cache_readonly
def is_na(self):
@@ -152,11 +165,11 @@ def is_na(self):
elif self.block.is_extension:
values_flat = values
else:
- values_flat = values.ravel(order='K')
+ values_flat = values.ravel(order="K")
total_len = values_flat.shape[0]
chunk_len = max(total_len // 40, 1000)
for i in range(0, total_len, chunk_len):
- if not isna(values_flat[i:i + chunk_len]).all():
+ if not isna(values_flat[i : i + chunk_len]).all():
return False
return True
@@ -170,24 +183,26 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
fill_value = upcasted_na
if self.is_na:
- if getattr(self.block, 'is_object', False):
+ if getattr(self.block, "is_object", False):
# we want to avoid filling with np.nan if we are
# using None; we already know that we are all
# nulls
- values = self.block.values.ravel(order='K')
+ values = self.block.values.ravel(order="K")
if len(values) and values[0] is None:
fill_value = None
- if (getattr(self.block, 'is_datetimetz', False) or
- is_datetime64tz_dtype(empty_dtype)):
+ if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype(
+ empty_dtype
+ ):
if self.block is None:
array = empty_dtype.construct_array_type()
- return array(np.full(self.shape[1], fill_value.value),
- dtype=empty_dtype)
+ return array(
+ np.full(self.shape[1], fill_value.value), dtype=empty_dtype
+ )
pass
- elif getattr(self.block, 'is_categorical', False):
+ elif getattr(self.block, "is_categorical", False):
pass
- elif getattr(self.block, 'is_extension', False):
+ elif getattr(self.block, "is_extension", False):
pass
else:
missing_arr = np.empty(self.shape, dtype=empty_dtype)
@@ -218,8 +233,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
else:
for ax, indexer in self.indexers.items():
- values = algos.take_nd(values, indexer, axis=ax,
- fill_value=fill_value)
+ values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value)
return values
@@ -234,9 +248,10 @@ def concatenate_join_units(join_units, concat_axis, copy):
empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
- to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
- upcasted_na=upcasted_na)
- for ju in join_units]
+ to_concat = [
+ ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
+ for ju in join_units
+ ]
if len(to_concat) == 1:
# Only one block, nothing to concatenate.
@@ -292,25 +307,25 @@ def get_empty_dtype_and_na(join_units):
continue
if is_categorical_dtype(dtype):
- upcast_cls = 'category'
+ upcast_cls = "category"
elif is_datetime64tz_dtype(dtype):
- upcast_cls = 'datetimetz'
+ upcast_cls = "datetimetz"
elif issubclass(dtype.type, np.bool_):
- upcast_cls = 'bool'
+ upcast_cls = "bool"
elif issubclass(dtype.type, np.object_):
- upcast_cls = 'object'
+ upcast_cls = "object"
elif is_datetime64_dtype(dtype):
- upcast_cls = 'datetime'
+ upcast_cls = "datetime"
elif is_timedelta64_dtype(dtype):
- upcast_cls = 'timedelta'
+ upcast_cls = "timedelta"
elif is_sparse(dtype):
upcast_cls = dtype.subtype.name
elif is_extension_array_dtype(dtype):
- upcast_cls = 'object'
+ upcast_cls = "object"
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
upcast_cls = dtype.name
else:
- upcast_cls = 'float'
+ upcast_cls = "float"
# Null blocks should not influence upcast class selection, unless there
# are only null blocks, when same upcasting rules must be applied to
@@ -324,24 +339,24 @@ def get_empty_dtype_and_na(join_units):
upcast_classes = null_upcast_classes
# create the result
- if 'object' in upcast_classes:
+ if "object" in upcast_classes:
return np.dtype(np.object_), np.nan
- elif 'bool' in upcast_classes:
+ elif "bool" in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_), np.nan
else:
return np.dtype(np.bool_), None
- elif 'category' in upcast_classes:
+ elif "category" in upcast_classes:
return np.dtype(np.object_), np.nan
- elif 'datetimetz' in upcast_classes:
+ elif "datetimetz" in upcast_classes:
# GH-25014. We use NaT instead of iNaT, since this eventually
# ends up in DatetimeArray.take, which does not allow iNaT.
- dtype = upcast_classes['datetimetz']
+ dtype = upcast_classes["datetimetz"]
return dtype[0], tslibs.NaT
- elif 'datetime' in upcast_classes:
- return np.dtype('M8[ns]'), tslibs.iNaT
- elif 'timedelta' in upcast_classes:
- return np.dtype('m8[ns]'), tslibs.iNaT
+ elif "datetime" in upcast_classes:
+ return np.dtype("M8[ns]"), tslibs.iNaT
+ elif "timedelta" in upcast_classes:
+ return np.dtype("m8[ns]"), tslibs.iNaT
else: # pragma
try:
g = np.find_common_type(upcast_classes, [])
@@ -370,21 +385,25 @@ def is_uniform_join_units(join_units):
"""
return (
# all blocks need to have the same type
- all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa
+ all(type(ju.block) is type(join_units[0].block) for ju in join_units)
+ and # noqa
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
- all(not ju.is_na or ju.block.is_extension for ju in join_units) and
+ all(not ju.is_na or ju.block.is_extension for ju in join_units)
+ and
# no blocks with indexers (as then the dimensions do not fit)
- all(not ju.indexers for ju in join_units) and
+ all(not ju.indexers for ju in join_units)
+ and
# only use this path when there is something to concatenate
- len(join_units) > 1)
+ len(join_units) > 1
+ )
def is_uniform_reindex(join_units):
return (
# TODO: should this be ju.block._can_hold_na?
- all(ju.block and ju.block.is_extension for ju in join_units) and
- len({ju.block.dtype.name for ju in join_units}) == 1
+ all(ju.block and ju.block.is_extension for ju in join_units)
+ and len({ju.block.dtype.name for ju in join_units}) == 1
)
@@ -413,8 +432,7 @@ def trim_join_unit(join_unit, length):
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
join_unit.shape = (length,) + join_unit.shape[1:]
- return JoinUnit(block=extra_block, indexers=extra_indexers,
- shape=extra_shape)
+ return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape)
def combine_concat_plans(plans, concat_axis):
@@ -471,8 +489,7 @@ def _next_or_none(seq):
if len(plc) > min_len:
# trim_join_unit updates unit in place, so only
# placement needs to be sliced to skip min_len.
- next_items[i] = (plc[min_len:],
- trim_join_unit(unit, min_len))
+ next_items[i] = (plc[min_len:], trim_join_unit(unit, min_len))
else:
yielded_placement = plc
next_items[i] = _next_or_none(plans[i])
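
A side note on the slice spacing in the is_na chunking loop above (values_flat[i : i + chunk_len]): black pads the colon when a slice bound is a compound expression, treating the colon like a low-priority binary operator. The short sketch below (illustration only, not taken from the patch) confirms the padded and unpadded spellings select the same chunk; the flake8 remark in the final comment describes common practice for projects adopting black and is an assumption, not something visible in this diff.

# Illustration: the padded slice black writes is behaviorally identical to the
# unpadded spelling it replaces.
values_flat = list(range(100))
chunk_len = 40

for i in range(0, len(values_flat), chunk_len):
    assert values_flat[i : i + chunk_len] == values_flat[i:i + chunk_len]

# Projects adopting black typically configure flake8 to ignore E203
# ("whitespace before ':'") so formatter and linter agree on this pattern.
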
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 1044f25a6bbcd..4d64be34e624f 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -12,26 +12,53 @@
from pandas.compat import raise_with_traceback
from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
- construct_1d_object_array_from_listlike, infer_dtype_from_scalar,
- maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_castable,
- maybe_convert_platform, maybe_infer_to_datetimelike, maybe_upcast)
+ construct_1d_arraylike_from_scalar,
+ construct_1d_ndarray_preserving_na,
+ construct_1d_object_array_from_listlike,
+ infer_dtype_from_scalar,
+ maybe_cast_to_datetime,
+ maybe_cast_to_integer_array,
+ maybe_castable,
+ maybe_convert_platform,
+ maybe_infer_to_datetimelike,
+ maybe_upcast,
+)
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
- is_extension_array_dtype, is_extension_type, is_float_dtype,
- is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
+ is_categorical_dtype,
+ is_datetime64tz_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_float_dtype,
+ is_integer_dtype,
+ is_iterator,
+ is_list_like,
+ is_object_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries,
- ABCTimedeltaIndex)
+ ABCDataFrame,
+ ABCDatetimeIndex,
+ ABCIndexClass,
+ ABCPeriodIndex,
+ ABCSeries,
+ ABCTimedeltaIndex,
+)
from pandas.core.dtypes.missing import isna
from pandas.core import algorithms, common as com
from pandas.core.arrays import Categorical, ExtensionArray, period_array
from pandas.core.index import (
- Index, _get_objs_combined_axis, _union_indexes, ensure_index)
+ Index,
+ _get_objs_combined_axis,
+ _union_indexes,
+ ensure_index,
+)
from pandas.core.indexes import base as ibase
from pandas.core.internals import (
- create_block_manager_from_arrays, create_block_manager_from_blocks)
+ create_block_manager_from_arrays,
+ create_block_manager_from_blocks,
+)
from pandas.core.internals.arrays import extract_array
# ---------------------------------------------------------------------
@@ -101,6 +128,7 @@ def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
# ---------------------------------------------------------------------
# DataFrame Constructor Interface
+
def init_ndarray(values, index, columns, dtype=None, copy=False):
# input must be a ndarray, list, Series, index
@@ -119,24 +147,23 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
# we could have a categorical type passed or coerced to 'category'
# recast this to an arrays_to_mgr
- if (is_categorical_dtype(getattr(values, 'dtype', None)) or
- is_categorical_dtype(dtype)):
+ if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype(
+ dtype
+ ):
- if not hasattr(values, 'dtype'):
+ if not hasattr(values, "dtype"):
values = prep_ndarray(values, copy=copy)
values = values.ravel()
elif copy:
values = values.copy()
index, columns = _get_axes(len(values), 1, index, columns)
- return arrays_to_mgr([values], columns, index, columns,
- dtype=dtype)
+ return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
elif is_extension_array_dtype(values):
# GH#19157
if columns is None:
columns = [0]
- return arrays_to_mgr([values], columns, index, columns,
- dtype=dtype)
+ return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
# by definition an array here
# the dtypes will be coerced to a single dtype
@@ -147,9 +174,10 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
try:
values = values.astype(dtype)
except Exception as orig:
- e = ValueError("failed to cast to '{dtype}' (Exception "
- "was: {orig})".format(dtype=dtype,
- orig=orig))
+ e = ValueError(
+ "failed to cast to '{dtype}' (Exception "
+ "was: {orig})".format(dtype=dtype, orig=orig)
+ )
raise_with_traceback(e)
index, columns = _get_axes(*values.shape, index=index, columns=columns)
@@ -171,8 +199,9 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
from pandas.core.internals.blocks import make_block
# TODO: What about re-joining object columns?
- block_values = [make_block(dvals_list[n], placement=[n])
- for n in range(len(dvals_list))]
+ block_values = [
+ make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list))
+ ]
else:
datelike_vals = maybe_infer_to_datetimelike(values)
@@ -190,6 +219,7 @@ def init_dict(data, index, columns, dtype=None):
"""
if columns is not None:
from pandas.core.series import Series
+
arrays = Series(data, index=columns, dtype=object)
data_names = arrays.index
@@ -208,8 +238,7 @@ def init_dict(data, index, columns, dtype=None):
nan_dtype = object
else:
nan_dtype = dtype
- val = construct_1d_arraylike_from_scalar(np.nan, len(index),
- nan_dtype)
+ val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
arrays.loc[missing] = [val] * missing.sum()
else:
@@ -218,15 +247,18 @@ def init_dict(data, index, columns, dtype=None):
arrays = (com.maybe_iterable_to_list(data[k]) for k in keys)
# GH#24096 need copy to be deep for datetime64tz case
# TODO: See if we can avoid these copies
- arrays = [arr if not isinstance(arr, ABCIndexClass) else arr._data
- for arr in arrays]
- arrays = [arr if not is_datetime64tz_dtype(arr) else
- arr.copy() for arr in arrays]
+ arrays = [
+ arr if not isinstance(arr, ABCIndexClass) else arr._data for arr in arrays
+ ]
+ arrays = [
+ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
+ ]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
# ---------------------------------------------------------------------
+
def prep_ndarray(values, copy=True):
if not isinstance(values, (np.ndarray, ABCSeries, Index)):
if len(values) == 0:
@@ -239,7 +271,7 @@ def convert(v):
# this is equiv of np.asarray, but does object conversion
# and platform dtype preservation
try:
- if is_list_like(values[0]) or hasattr(values[0], 'len'):
+ if is_list_like(values[0]) or hasattr(values[0], "len"):
values = np.array([convert(v) for v in values])
elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
# GH#21861
@@ -259,7 +291,7 @@ def convert(v):
if values.ndim == 1:
values = values.reshape((values.shape[0], 1))
elif values.ndim != 2:
- raise ValueError('Must pass 2-d input')
+ raise ValueError("Must pass 2-d input")
return values
@@ -279,15 +311,16 @@ def _homogenize(data, index, dtype=None):
else:
if isinstance(val, dict):
if oindex is None:
- oindex = index.astype('O')
+ oindex = index.astype("O")
if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)):
val = com.dict_compat(val)
else:
val = dict(val)
val = lib.fast_multiget(val, oindex.values, default=np.nan)
- val = sanitize_array(val, index, dtype=dtype, copy=False,
- raise_cast_failure=False)
+ val = sanitize_array(
+ val, index, dtype=dtype, copy=False, raise_cast_failure=False
+ )
homogenized.append(val)
@@ -313,13 +346,12 @@ def extract_index(data):
elif isinstance(val, dict):
have_dicts = True
indexes.append(list(val.keys()))
- elif is_list_like(val) and getattr(val, 'ndim', 1) == 1:
+ elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
have_raw_arrays = True
raw_lengths.append(len(val))
if not indexes and not raw_lengths:
- raise ValueError('If using all scalar values, you must pass'
- ' an index')
+ raise ValueError("If using all scalar values, you must pass" " an index")
if have_series or have_dicts:
index = _union_indexes(indexes)
@@ -327,17 +359,19 @@ def extract_index(data):
if have_raw_arrays:
lengths = list(set(raw_lengths))
if len(lengths) > 1:
- raise ValueError('arrays must all be same length')
+ raise ValueError("arrays must all be same length")
if have_dicts:
- raise ValueError('Mixing dicts with non-Series may lead to '
- 'ambiguous ordering.')
+ raise ValueError(
+ "Mixing dicts with non-Series may lead to " "ambiguous ordering."
+ )
if have_series:
if lengths[0] != len(index):
- msg = ('array length {length} does not match index '
- 'length {idx_len}'
- .format(length=lengths[0], idx_len=len(index)))
+ msg = (
+ "array length {length} does not match index "
+ "length {idx_len}".format(length=lengths[0], idx_len=len(index))
+ )
raise ValueError(msg)
else:
index = ibase.default_index(lengths[0])
@@ -347,8 +381,12 @@ def extract_index(data):
def reorder_arrays(arrays, arr_columns, columns):
# reorder according to the columns
- if (columns is not None and len(columns) and arr_columns is not None and
- len(arr_columns)):
+ if (
+ columns is not None
+ and len(columns)
+ and arr_columns is not None
+ and len(arr_columns)
+ ):
indexer = ensure_index(arr_columns).get_indexer(columns)
arr_columns = ensure_index([arr_columns[i] for i in indexer])
arrays = [arrays[i] for i in indexer]
@@ -356,18 +394,18 @@ def reorder_arrays(arrays, arr_columns, columns):
def get_names_from_index(data):
- has_some_name = any(getattr(s, 'name', None) is not None for s in data)
+ has_some_name = any(getattr(s, "name", None) is not None for s in data)
if not has_some_name:
return ibase.default_index(len(data))
index = list(range(len(data)))
count = 0
for i, s in enumerate(data):
- n = getattr(s, 'name', None)
+ n = getattr(s, "name", None)
if n is not None:
index[i] = n
else:
- index[i] = 'Unnamed {count}'.format(count=count)
+ index[i] = "Unnamed {count}".format(count=count)
count += 1
return index
@@ -392,14 +430,18 @@ def _get_axes(N, K, index, columns):
# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays
+
def to_arrays(data, columns, coerce_float=False, dtype=None):
"""
Return list of arrays, columns.
"""
if isinstance(data, ABCDataFrame):
if columns is not None:
- arrays = [data._ixs(i, axis=1).values
- for i, col in enumerate(data.columns) if col in columns]
+ arrays = [
+ data._ixs(i, axis=1).values
+ for i, col in enumerate(data.columns)
+ if col in columns
+ ]
else:
columns = data.columns
arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]
@@ -413,21 +455,23 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
return [[]] * len(columns), columns
return [], [] # columns if columns is not None else []
if isinstance(data[0], (list, tuple)):
- return _list_to_arrays(data, columns, coerce_float=coerce_float,
- dtype=dtype)
+ return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], abc.Mapping):
- return _list_of_dict_to_arrays(data, columns,
- coerce_float=coerce_float, dtype=dtype)
+ return _list_of_dict_to_arrays(
+ data, columns, coerce_float=coerce_float, dtype=dtype
+ )
elif isinstance(data[0], ABCSeries):
- return _list_of_series_to_arrays(data, columns,
- coerce_float=coerce_float,
- dtype=dtype)
+ return _list_of_series_to_arrays(
+ data, columns, coerce_float=coerce_float, dtype=dtype
+ )
elif isinstance(data[0], Categorical):
if columns is None:
columns = ibase.default_index(len(data))
return data, columns
- elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and
- data.dtype.names is not None):
+ elif (
+ isinstance(data, (np.ndarray, ABCSeries, Index))
+ and data.dtype.names is not None
+ ):
columns = list(data.dtype.names)
arrays = [data[k] for k in columns]
@@ -435,8 +479,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None):
else:
# last ditch effort
data = [tuple(x) for x in data]
- return _list_to_arrays(data, columns, coerce_float=coerce_float,
- dtype=dtype)
+ return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
@@ -447,8 +490,9 @@ def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
content = list(lib.to_object_array(data).T)
# gh-26429 do not raise user-facing AssertionError
try:
- result = _convert_object_array(content, columns, dtype=dtype,
- coerce_float=coerce_float)
+ result = _convert_object_array(
+ content, columns, dtype=dtype, coerce_float=coerce_float
+ )
except AssertionError as e:
raise ValueError(e) from e
return result
@@ -462,7 +506,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
aligned_values = []
for s in data:
- index = getattr(s, 'index', None)
+ index = getattr(s, "index", None)
if index is None:
index = ibase.default_index(len(s))
@@ -478,8 +522,9 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
if values.dtype == np.object_:
content = list(values.T)
- return _convert_object_array(content, columns, dtype=dtype,
- coerce_float=coerce_float)
+ return _convert_object_array(
+ content, columns, dtype=dtype, coerce_float=coerce_float
+ )
else:
return values.T, columns
@@ -495,8 +540,9 @@ def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
data = [(type(d) is dict) and d or dict(d) for d in data]
content = list(lib.dicts_to_array(data, list(columns)).T)
- return _convert_object_array(content, columns, dtype=dtype,
- coerce_float=coerce_float)
+ return _convert_object_array(
+ content, columns, dtype=dtype, coerce_float=coerce_float
+ )
def _convert_object_array(content, columns, coerce_float=False, dtype=None):
@@ -505,9 +551,10 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None):
else:
if len(columns) != len(content): # pragma: no cover
# caller's responsibility to check for this...
- raise AssertionError('{col:d} columns passed, passed data had '
- '{con} columns'.format(col=len(columns),
- con=len(content)))
+ raise AssertionError(
+ "{col:d} columns passed, passed data had "
+ "{con} columns".format(col=len(columns), con=len(content))
+ )
# provide soft conversion of object dtypes
def convert(arr):
@@ -524,6 +571,7 @@ def convert(arr):
# ---------------------------------------------------------------------
# Series-Based
+
def sanitize_index(data, index, copy=False):
"""
Sanitize an index type to return an ndarray of the underlying, pass
@@ -534,7 +582,7 @@ def sanitize_index(data, index, copy=False):
return data
if len(data) != len(index):
- raise ValueError('Length of values does not match length of index')
+ raise ValueError("Length of values does not match length of index")
if isinstance(data, ABCIndexClass) and not copy:
pass
@@ -546,14 +594,13 @@ def sanitize_index(data, index, copy=False):
elif isinstance(data, np.ndarray):
# coerce datetimelike types
- if data.dtype.kind in ['M', 'm']:
+ if data.dtype.kind in ["M", "m"]:
data = sanitize_array(data, index, copy=copy)
return data
-def sanitize_array(data, index, dtype=None, copy=False,
- raise_cast_failure=False):
+def sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False):
"""
Sanitize input data to an ndarray, copy if specified, coerce to the
dtype if specified.
@@ -576,8 +623,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
# GH#846
if isinstance(data, np.ndarray):
- if (dtype is not None
- and is_float_dtype(data.dtype) and is_integer_dtype(dtype)):
+ if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
# possibility of nan -> garbage
try:
subarr = _try_cast(data, dtype, copy, True)
@@ -617,13 +663,13 @@ def sanitize_array(data, index, dtype=None, copy=False,
elif isinstance(data, range):
# GH#16804
- arr = np.arange(data.start, data.stop, data.step, dtype='int64')
+ arr = np.arange(data.start, data.stop, data.step, dtype="int64")
subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
else:
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
# scalar like, GH
- if getattr(subarr, 'ndim', 0) == 0:
+ if getattr(subarr, "ndim", 0) == 0:
if isinstance(data, list): # pragma: no cover
subarr = np.array(data, dtype=object)
elif index is not None:
@@ -636,8 +682,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
# need to possibly convert the value here
value = maybe_cast_to_datetime(value, dtype)
- subarr = construct_1d_arraylike_from_scalar(
- value, len(index), dtype)
+ subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)
else:
return subarr.item()
@@ -649,11 +694,12 @@ def sanitize_array(data, index, dtype=None, copy=False,
# a 1-element ndarray
if len(subarr) != len(index) and len(subarr) == 1:
subarr = construct_1d_arraylike_from_scalar(
- subarr[0], len(index), subarr.dtype)
+ subarr[0], len(index), subarr.dtype
+ )
elif subarr.ndim > 1:
if isinstance(data, np.ndarray):
- raise Exception('Data must be 1-dimensional')
+ raise Exception("Data must be 1-dimensional")
else:
subarr = com.asarray_tuplesafe(data, dtype=dtype)
@@ -668,12 +714,13 @@ def sanitize_array(data, index, dtype=None, copy=False,
data = np.array(data, dtype=dtype, copy=False)
subarr = np.array(data, dtype=object, copy=copy)
- if (not (is_extension_array_dtype(subarr.dtype) or
- is_extension_array_dtype(dtype)) and
- is_object_dtype(subarr.dtype) and
- not is_object_dtype(dtype)):
+ if (
+ not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype))
+ and is_object_dtype(subarr.dtype)
+ and not is_object_dtype(dtype)
+ ):
inferred = lib.infer_dtype(subarr, skipna=False)
- if inferred == 'period':
+ if inferred == "period":
try:
subarr = period_array(subarr)
except IncompatibleFrequency:
@@ -710,13 +757,13 @@ def _try_cast(arr, dtype, copy, raise_cast_failure):
subarr = maybe_cast_to_datetime(arr, dtype)
# Take care in creating object arrays (but iterators are not
# supported):
- if is_object_dtype(dtype) and (is_list_like(subarr) and
- not (is_iterator(subarr) or
- isinstance(subarr, np.ndarray))):
+ if is_object_dtype(dtype) and (
+ is_list_like(subarr)
+ and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))
+ ):
subarr = construct_1d_object_array_from_listlike(subarr)
elif not is_extension_type(subarr):
- subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
- copy=copy)
+ subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy)
except OutOfBoundsDatetime:
# in case of out of bound datetime64 -> always raise
raise
@@ -724,8 +771,7 @@ def _try_cast(arr, dtype, copy, raise_cast_failure):
if is_categorical_dtype(dtype):
# We *do* allow casting to categorical, since we know
# that Categorical is the only array type for 'category'.
- subarr = Categorical(arr, dtype.categories,
- ordered=dtype._ordered)
+ subarr = Categorical(arr, dtype.categories, ordered=dtype._ordered)
elif is_extension_array_dtype(dtype):
# create an extension array from its dtype
array_type = dtype.construct_array_type()._from_sequence
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index cdf0826bbe21e..c5254aaa4af5f 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -11,12 +11,21 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.cast import (
- find_common_type, infer_dtype_from_scalar, maybe_convert_objects,
- maybe_promote)
+ find_common_type,
+ infer_dtype_from_scalar,
+ maybe_convert_objects,
+ maybe_promote,
+)
from pandas.core.dtypes.common import (
- _NS_DTYPE, is_datetimelike_v_numeric, is_extension_array_dtype,
- is_extension_type, is_list_like, is_numeric_v_string_like, is_scalar,
- is_sparse)
+ _NS_DTYPE,
+ is_datetimelike_v_numeric,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_list_like,
+ is_numeric_v_string_like,
+ is_scalar,
+ is_sparse,
+)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries
@@ -30,12 +39,23 @@
from pandas.io.formats.printing import pprint_thing
from .blocks import (
- Block, CategoricalBlock, DatetimeTZBlock, ExtensionBlock,
- ObjectValuesExtensionBlock, _extend_blocks, _merge_blocks, _safe_reshape,
- get_block_type, make_block)
+ Block,
+ CategoricalBlock,
+ DatetimeTZBlock,
+ ExtensionBlock,
+ ObjectValuesExtensionBlock,
+ _extend_blocks,
+ _merge_blocks,
+ _safe_reshape,
+ get_block_type,
+ make_block,
+)
from .concat import ( # all for concatenate_block_managers
- combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan,
- is_uniform_join_units)
+ combine_concat_plans,
+ concatenate_join_units,
+ get_mgr_concatenation_plan,
+ is_uniform_join_units,
+)
# TODO: flexible with index=None and/or items=None
@@ -91,22 +111,33 @@ class BlockManager(PandasObject):
-----
This is *not* a public API class
"""
- __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated',
- '_is_consolidated', '_blknos', '_blklocs']
- def __init__(self,
- blocks: Sequence[Block],
- axes: Sequence[Index],
- do_integrity_check: bool = True):
+ __slots__ = [
+ "axes",
+ "blocks",
+ "_ndim",
+ "_shape",
+ "_known_consolidated",
+ "_is_consolidated",
+ "_blknos",
+ "_blklocs",
+ ]
+
+ def __init__(
+ self,
+ blocks: Sequence[Block],
+ axes: Sequence[Index],
+ do_integrity_check: bool = True,
+ ):
self.axes = [ensure_index(ax) for ax in axes]
self.blocks = tuple(blocks) # type: Tuple[Block, ...]
for block in blocks:
if self.ndim != block.ndim:
raise AssertionError(
- 'Number of Block dimensions ({block}) must equal '
- 'number of axes ({self})'.format(block=block.ndim,
- self=self.ndim))
+ "Number of Block dimensions ({block}) must equal "
+ "number of axes ({self})".format(block=block.ndim, self=self.ndim)
+ )
if do_integrity_check:
self._verify_integrity()
@@ -118,8 +149,7 @@ def __init__(self,
def make_empty(self, axes=None):
""" return an empty BlockManager with the items axis of len 0 """
if axes is None:
- axes = [ensure_index([])] + [ensure_index(a)
- for a in self.axes[1:]]
+ axes = [ensure_index([])] + [ensure_index(a) for a in self.axes[1:]]
# preserve dtype if possible
if self.ndim == 1:
@@ -149,8 +179,9 @@ def set_axis(self, axis, new_labels):
if new_len != old_len:
raise ValueError(
- 'Length mismatch: Expected axis has {old} elements, new '
- 'values have {new} elements'.format(old=old_len, new=new_len))
+ "Length mismatch: Expected axis has {old} elements, new "
+ "values have {new} elements".format(old=old_len, new=new_len)
+ )
self.axes[axis] = new_labels
@@ -178,8 +209,9 @@ def _is_single_block(self):
return False
blk = self.blocks[0]
- return (blk.mgr_locs.is_slice_like and
- blk.mgr_locs.as_slice == slice(0, len(self), 1))
+ return blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice == slice(
+ 0, len(self), 1
+ )
def _rebuild_blknos_and_blklocs(self):
"""
@@ -234,10 +266,12 @@ def __getstate__(self):
axes_array = [ax for ax in self.axes]
extra_state = {
- '0.14.1': {
- 'axes': axes_array,
- 'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer)
- for b in self.blocks]
+ "0.14.1": {
+ "axes": axes_array,
+ "blocks": [
+ dict(values=b.values, mgr_locs=b.mgr_locs.indexer)
+ for b in self.blocks
+ ],
}
}
@@ -249,12 +283,12 @@ def __setstate__(self, state):
def unpickle_block(values, mgr_locs):
return make_block(values, placement=mgr_locs)
- if (isinstance(state, tuple) and len(state) >= 4 and
- '0.14.1' in state[3]):
- state = state[3]['0.14.1']
- self.axes = [ensure_index(ax) for ax in state['axes']]
- self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs'])
- for b in state['blocks'])
+ if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
+ state = state[3]["0.14.1"]
+ self.axes = [ensure_index(ax) for ax in state["axes"]]
+ self.blocks = tuple(
+ unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"]
+ )
else:
# discard anything after 3rd, support beta pickling format for a
# little while longer
@@ -272,12 +306,14 @@ def unpickle_block(values, mgr_locs):
# block items corresponded to manager items 1-to-1.
all_mgr_locs = [slice(0, len(bitems[0]))]
else:
- all_mgr_locs = [self.axes[0].get_indexer(blk_items)
- for blk_items in bitems]
+ all_mgr_locs = [
+ self.axes[0].get_indexer(blk_items) for blk_items in bitems
+ ]
self.blocks = tuple(
unpickle_block(values, mgr_locs)
- for values, mgr_locs in zip(bvalues, all_mgr_locs))
+ for values, mgr_locs in zip(bvalues, all_mgr_locs)
+ )
self._post_setstate()
@@ -293,12 +329,12 @@ def __repr__(self):
output = pprint_thing(self.__class__.__name__)
for i, ax in enumerate(self.axes):
if i == 0:
- output += '\nItems: {ax}'.format(ax=ax)
+ output += "\nItems: {ax}".format(ax=ax)
else:
- output += '\nAxis {i}: {ax}'.format(i=i, ax=ax)
+ output += "\nAxis {i}: {ax}".format(i=i, ax=ax)
for block in self.blocks:
- output += '\n{block}'.format(block=pprint_thing(block))
+ output += "\n{block}".format(block=pprint_thing(block))
return output
def _verify_integrity(self):
@@ -308,13 +344,21 @@ def _verify_integrity(self):
if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
construction_error(tot_items, block.shape[1:], self.axes)
if len(self.items) != tot_items:
- raise AssertionError('Number of manager items must equal union of '
- 'block items\n# manager items: {0}, # '
- 'tot_items: {1}'.format(
- len(self.items), tot_items))
+ raise AssertionError(
+ "Number of manager items must equal union of "
+ "block items\n# manager items: {0}, # "
+ "tot_items: {1}".format(len(self.items), tot_items)
+ )
- def apply(self, f, axes=None, filter=None, do_integrity_check=False,
- consolidate=True, **kwargs):
+ def apply(
+ self,
+ f,
+ axes=None,
+ filter=None,
+ do_integrity_check=False,
+ consolidate=True,
+ **kwargs
+ ):
"""
iterate over the blocks, collect and create a new block manager
@@ -344,37 +388,39 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
# All items are included, as if there were no filtering
filter = None
else:
- kwargs['filter'] = filter_locs
+ kwargs["filter"] = filter_locs
if consolidate:
self._consolidate_inplace()
- if f == 'where':
+ if f == "where":
align_copy = True
- if kwargs.get('align', True):
- align_keys = ['other', 'cond']
+ if kwargs.get("align", True):
+ align_keys = ["other", "cond"]
else:
- align_keys = ['cond']
- elif f == 'putmask':
+ align_keys = ["cond"]
+ elif f == "putmask":
align_copy = False
- if kwargs.get('align', True):
- align_keys = ['new', 'mask']
+ if kwargs.get("align", True):
+ align_keys = ["new", "mask"]
else:
- align_keys = ['mask']
- elif f == 'fillna':
+ align_keys = ["mask"]
+ elif f == "fillna":
# fillna internally does putmask, maybe it's better to do this
# at mgr, not block level?
align_copy = False
- align_keys = ['value']
+ align_keys = ["value"]
else:
align_keys = []
# TODO(EA): may interfere with ExtensionBlock.setitem for blocks
# with a .values attribute.
- aligned_args = {k: kwargs[k]
- for k in align_keys
- if not isinstance(kwargs[k], ABCExtensionArray) and
- hasattr(kwargs[k], 'values')}
+ aligned_args = {
+ k: kwargs[k]
+ for k in align_keys
+ if not isinstance(kwargs[k], ABCExtensionArray)
+ and hasattr(kwargs[k], "values")
+ }
for b in self.blocks:
if filter is not None:
@@ -386,22 +432,29 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
b_items = self.items[b.mgr_locs.indexer]
for k, obj in aligned_args.items():
- axis = getattr(obj, '_info_axis_number', 0)
- kwargs[k] = obj.reindex(b_items, axis=axis,
- copy=align_copy)
+ axis = getattr(obj, "_info_axis_number", 0)
+ kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
applied = getattr(b, f)(**kwargs)
result_blocks = _extend_blocks(applied, result_blocks)
if len(result_blocks) == 0:
return self.make_empty(axes or self.axes)
- bm = self.__class__(result_blocks, axes or self.axes,
- do_integrity_check=do_integrity_check)
+ bm = self.__class__(
+ result_blocks, axes or self.axes, do_integrity_check=do_integrity_check
+ )
bm._consolidate_inplace()
return bm
- def quantile(self, axis=0, consolidate=True, transposed=False,
- interpolation='linear', qs=None, numeric_only=None):
+ def quantile(
+ self,
+ axis=0,
+ consolidate=True,
+ transposed=False,
+ interpolation="linear",
+ qs=None,
+ numeric_only=None,
+ ):
"""
Iterate over blocks applying quantile reduction.
This routine is intended for reduction type operations and
@@ -434,6 +487,7 @@ def get_axe(block, qs, axes):
# Because Series dispatches to DataFrame, we will always have
# block.ndim == 2
from pandas import Float64Index
+
if is_list_like(qs):
ax = Float64Index(qs)
else:
@@ -466,14 +520,14 @@ def get_axe(block, qs, axes):
b.mgr_locs = sb.mgr_locs
else:
- new_axes[axis] = Index(np.concatenate(
- [ax.values for ax in axes]))
+ new_axes[axis] = Index(np.concatenate([ax.values for ax in axes]))
if transposed:
new_axes = new_axes[::-1]
- blocks = [b.make_block(b.values.T,
- placement=np.arange(b.shape[1])
- ) for b in blocks]
+ blocks = [
+ b.make_block(b.values.T, placement=np.arange(b.shape[1]))
+ for b in blocks
+ ]
return self.__class__(blocks, new_axes)
@@ -493,51 +547,49 @@ def get_axe(block, qs, axes):
values = values.take(indexer)
return SingleBlockManager(
- [make_block(values,
- ndim=1,
- placement=np.arange(len(values)))],
- axes[0])
+ [make_block(values, ndim=1, placement=np.arange(len(values)))], axes[0]
+ )
def isna(self, func, **kwargs):
- return self.apply('apply', func=func, **kwargs)
+ return self.apply("apply", func=func, **kwargs)
def where(self, **kwargs):
- return self.apply('where', **kwargs)
+ return self.apply("where", **kwargs)
def setitem(self, **kwargs):
- return self.apply('setitem', **kwargs)
+ return self.apply("setitem", **kwargs)
def putmask(self, **kwargs):
- return self.apply('putmask', **kwargs)
+ return self.apply("putmask", **kwargs)
def diff(self, **kwargs):
- return self.apply('diff', **kwargs)
+ return self.apply("diff", **kwargs)
def interpolate(self, **kwargs):
- return self.apply('interpolate', **kwargs)
+ return self.apply("interpolate", **kwargs)
def shift(self, **kwargs):
- return self.apply('shift', **kwargs)
+ return self.apply("shift", **kwargs)
def fillna(self, **kwargs):
- return self.apply('fillna', **kwargs)
+ return self.apply("fillna", **kwargs)
def downcast(self, **kwargs):
- return self.apply('downcast', **kwargs)
+ return self.apply("downcast", **kwargs)
def astype(self, dtype, **kwargs):
- return self.apply('astype', dtype=dtype, **kwargs)
+ return self.apply("astype", dtype=dtype, **kwargs)
def convert(self, **kwargs):
- return self.apply('convert', **kwargs)
+ return self.apply("convert", **kwargs)
def replace(self, **kwargs):
- return self.apply('replace', **kwargs)
+ return self.apply("replace", **kwargs)
def replace_list(self, src_list, dest_list, inplace=False, regex=False):
""" do a list replace """
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# figure out our mask a-priori to avoid repeated replacements
values = self.as_array()
@@ -549,9 +601,10 @@ def comp(s, regex=False):
"""
if isna(s):
return isna(values)
- if hasattr(s, 'asm8'):
- return _compare_or_regex_search(maybe_convert_objects(values),
- getattr(s, 'asm8'), regex)
+ if hasattr(s, "asm8"):
+ return _compare_or_regex_search(
+ maybe_convert_objects(values), getattr(s, "asm8"), regex
+ )
return _compare_or_regex_search(values, s, regex)
masks = [comp(s, regex) for i, s in enumerate(src_list)]
@@ -568,9 +621,14 @@ def comp(s, regex=False):
for b in rb:
m = masks[i][b.mgr_locs.indexer]
convert = i == src_len
- result = b._replace_coerce(mask=m, to_replace=s, value=d,
- inplace=inplace,
- convert=convert, regex=regex)
+ result = b._replace_coerce(
+ mask=m,
+ to_replace=s,
+ value=d,
+ inplace=inplace,
+ convert=convert,
+ regex=regex,
+ )
if m.any():
new_rb = _extend_blocks(result, new_rb)
else:
@@ -659,15 +717,15 @@ def combine(self, blocks, copy=True):
return self.make_empty()
# FIXME: optimization potential
- indexer = np.sort(np.concatenate([b.mgr_locs.as_array
- for b in blocks]))
+ indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
new_blocks = []
for b in blocks:
b = b.copy(deep=copy)
- b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array,
- axis=0, allow_fill=False)
+ b.mgr_locs = algos.take_1d(
+ inv_indexer, b.mgr_locs.as_array, axis=0, allow_fill=False
+ )
new_blocks.append(b)
axes = list(self.axes)
@@ -717,15 +775,14 @@ def copy(self, deep=True):
"""
# this preserves the notion of view copying of axes
if deep:
- if deep == 'all':
+ if deep == "all":
copy = lambda ax: ax.copy(deep=True)
else:
copy = lambda ax: ax.view()
new_axes = [copy(ax) for ax in self.axes]
else:
new_axes = list(self.axes)
- return self.apply('copy', axes=new_axes, deep=deep,
- do_integrity_check=False)
+ return self.apply("copy", axes=new_axes, deep=deep, do_integrity_check=False)
def as_array(self, transpose=False, items=None):
"""Convert the blockmanager data into an numpy array.
@@ -775,7 +832,7 @@ def _interleave(self):
if is_sparse(dtype):
dtype = dtype.subtype
elif is_extension_array_dtype(dtype):
- dtype = 'object'
+ dtype = "object"
result = np.empty(self.shape, dtype=dtype)
@@ -787,7 +844,7 @@ def _interleave(self):
itemmask[rl.indexer] = 1
if not itemmask.all():
- raise AssertionError('Some items were not contained in blocks')
+ raise AssertionError("Some items were not contained in blocks")
return result
@@ -813,8 +870,7 @@ def to_dict(self, copy=True):
for b in self.blocks:
bd.setdefault(str(b.dtype), []).append(b)
- return {dtype: self.combine(blocks, copy=copy)
- for dtype, blocks in bd.items()}
+ return {dtype: self.combine(blocks, copy=copy) for dtype, blocks in bd.items()}
def fast_xs(self, loc):
"""
@@ -853,9 +909,7 @@ def fast_xs(self, loc):
result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
if is_extension_array_dtype(dtype):
- result = dtype.construct_array_type()._from_sequence(
- result, dtype=dtype
- )
+ result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)
return result
@@ -907,9 +961,9 @@ def get(self, item, fastpath=True):
raise TypeError("cannot label index with a null key")
indexer = self.items.get_indexer_for([item])
- return self.reindex_indexer(new_axis=self.items[indexer],
- indexer=indexer, axis=0,
- allow_dups=True)
+ return self.reindex_indexer(
+ new_axis=self.items[indexer], indexer=indexer, axis=0, allow_dups=True
+ )
def iget(self, i, fastpath=True):
"""
@@ -924,10 +978,13 @@ def iget(self, i, fastpath=True):
# fastpath shortcut for select a single-dim from a 2-dim BM
return SingleBlockManager(
- [block.make_block_same_class(values,
- placement=slice(0, len(values)),
- ndim=1)],
- self.axes[1])
+ [
+ block.make_block_same_class(
+ values, placement=slice(0, len(values)), ndim=1
+ )
+ ],
+ self.axes[1],
+ )
def delete(self, item):
"""
@@ -962,8 +1019,9 @@ def delete(self, item):
# FIXME: use Index.delete as soon as it uses fastpath=True
self.axes[0] = self.items[~is_deleted]
- self.blocks = tuple(b for blkno, b in enumerate(self.blocks)
- if not is_blk_deleted[blkno])
+ self.blocks = tuple(
+ b for blkno, b in enumerate(self.blocks) if not is_blk_deleted[blkno]
+ )
self._shape = None
self._rebuild_blknos_and_blklocs()
@@ -977,28 +1035,32 @@ def set(self, item, value):
# TODO(EA): Remove an is_extension_ when all extension types satisfy
# the interface
- value_is_extension_type = (is_extension_type(value) or
- is_extension_array_dtype(value))
+ value_is_extension_type = is_extension_type(value) or is_extension_array_dtype(
+ value
+ )
# categorical/sparse/datetimetz
if value_is_extension_type:
def value_getitem(placement):
return value
+
else:
if value.ndim == self.ndim - 1:
value = _safe_reshape(value, (1,) + value.shape)
def value_getitem(placement):
return value
+
else:
def value_getitem(placement):
return value[placement.indexer]
if value.shape[1:] != self.shape[1:]:
- raise AssertionError('Shape of new values must be compatible '
- 'with manager shape')
+ raise AssertionError(
+ "Shape of new values must be compatible " "with manager shape"
+ )
try:
loc = self.items.get_loc(item)
@@ -1016,9 +1078,9 @@ def value_getitem(placement):
unfit_mgr_locs = []
unfit_val_locs = []
removed_blknos = []
- for blkno, val_locs in libinternals.get_blkno_placements(blknos,
- self.nblocks,
- group=True):
+ for blkno, val_locs in libinternals.get_blkno_placements(
+ blknos, self.nblocks, group=True
+ ):
blk = self.blocks[blkno]
blk_locs = blklocs[val_locs.indexer]
if blk.should_store(value):
@@ -1042,12 +1104,13 @@ def value_getitem(placement):
new_blknos = np.empty(self.nblocks, dtype=np.int64)
new_blknos.fill(-1)
- new_blknos[~is_deleted] = np.arange(self.nblocks -
- len(removed_blknos))
- self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0,
- allow_fill=False)
- self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
- if i not in set(removed_blknos))
+ new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
+ self._blknos = algos.take_1d(
+ new_blknos, self._blknos, axis=0, allow_fill=False
+ )
+ self.blocks = tuple(
+ blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
+ )
if unfit_val_locs:
unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
@@ -1058,12 +1121,15 @@ def value_getitem(placement):
# This code (ab-)uses the fact that sparse blocks contain only
# one item.
new_blocks.extend(
- make_block(values=value.copy(), ndim=self.ndim,
- placement=slice(mgr_loc, mgr_loc + 1))
- for mgr_loc in unfit_mgr_locs)
-
- self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
- len(self.blocks))
+ make_block(
+ values=value.copy(),
+ ndim=self.ndim,
+ placement=slice(mgr_loc, mgr_loc + 1),
+ )
+ for mgr_loc in unfit_mgr_locs
+ )
+
+ self._blknos[unfit_mgr_locs] = np.arange(unfit_count) + len(self.blocks)
self._blklocs[unfit_mgr_locs] = 0
else:
@@ -1071,8 +1137,12 @@ def value_getitem(placement):
unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
new_blocks.append(
- make_block(values=value_getitem(unfit_val_items),
- ndim=self.ndim, placement=unfit_mgr_locs))
+ make_block(
+ values=value_getitem(unfit_val_items),
+ ndim=self.ndim,
+ placement=unfit_mgr_locs,
+ )
+ )
self._blknos[unfit_mgr_locs] = len(self.blocks)
self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
@@ -1097,7 +1167,7 @@ def insert(self, loc, item, value, allow_duplicates=False):
"""
if not allow_duplicates and item in self.items:
# Should this be a different kind of error??
- raise ValueError('cannot insert {}, already exists'.format(item))
+ raise ValueError("cannot insert {}, already exists".format(item))
if not isinstance(loc, int):
raise TypeError("loc must be int")
@@ -1105,8 +1175,7 @@ def insert(self, loc, item, value, allow_duplicates=False):
# insert to the axis; this could possibly raise a TypeError
new_axis = self.items.insert(loc, item)
- block = make_block(values=value, ndim=self.ndim,
- placement=slice(loc, loc + 1))
+ block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
for blkno, count in _fast_count_smallints(self._blknos[loc:]):
blk = self.blocks[blkno]
@@ -1134,20 +1203,24 @@ def insert(self, loc, item, value, allow_duplicates=False):
if len(self.blocks) > 100:
self._consolidate_inplace()
- def reindex_axis(self, new_index, axis, method=None, limit=None,
- fill_value=None, copy=True):
+ def reindex_axis(
+ self, new_index, axis, method=None, limit=None, fill_value=None, copy=True
+ ):
"""
Conform block manager to new index.
"""
new_index = ensure_index(new_index)
- new_index, indexer = self.axes[axis].reindex(new_index, method=method,
- limit=limit)
+ new_index, indexer = self.axes[axis].reindex(
+ new_index, method=method, limit=limit
+ )
- return self.reindex_indexer(new_index, indexer, axis=axis,
- fill_value=fill_value, copy=copy)
+ return self.reindex_indexer(
+ new_index, indexer, axis=axis, fill_value=fill_value, copy=copy
+ )
- def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
- allow_dups=False, copy=True):
+ def reindex_indexer(
+ self, new_axis, indexer, axis, fill_value=None, allow_dups=False, copy=True
+ ):
"""
Parameters
----------
@@ -1178,12 +1251,18 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
raise IndexError("Requested axis not found in manager")
if axis == 0:
- new_blocks = self._slice_take_blocks_ax0(indexer,
- fill_tuple=(fill_value,))
+ new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,))
else:
- new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
- fill_value if fill_value is not None else blk.fill_value,))
- for blk in self.blocks]
+ new_blocks = [
+ blk.take_nd(
+ indexer,
+ axis=axis,
+ fill_tuple=(
+ fill_value if fill_value is not None else blk.fill_value,
+ ),
+ )
+ for blk in self.blocks
+ ]
new_axes = list(self.axes)
new_axes[axis] = new_axis
@@ -1204,30 +1283,38 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
allow_fill = fill_tuple is not None
sl_type, slobj, sllen = _preprocess_slice_or_indexer(
- slice_or_indexer, self.shape[0], allow_fill=allow_fill)
+ slice_or_indexer, self.shape[0], allow_fill=allow_fill
+ )
if self._is_single_block:
blk = self.blocks[0]
- if sl_type in ('slice', 'mask'):
+ if sl_type in ("slice", "mask"):
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
elif not allow_fill or self.ndim == 1:
if allow_fill and fill_tuple[0] is None:
_, fill_value = maybe_promote(blk.dtype)
fill_tuple = (fill_value,)
- return [blk.take_nd(slobj, axis=0,
- new_mgr_locs=slice(0, sllen),
- fill_tuple=fill_tuple)]
+ return [
+ blk.take_nd(
+ slobj,
+ axis=0,
+ new_mgr_locs=slice(0, sllen),
+ fill_tuple=fill_tuple,
+ )
+ ]
- if sl_type in ('slice', 'mask'):
+ if sl_type in ("slice", "mask"):
blknos = self._blknos[slobj]
blklocs = self._blklocs[slobj]
else:
- blknos = algos.take_1d(self._blknos, slobj, fill_value=-1,
- allow_fill=allow_fill)
- blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1,
- allow_fill=allow_fill)
+ blknos = algos.take_1d(
+ self._blknos, slobj, fill_value=-1, allow_fill=allow_fill
+ )
+ blklocs = algos.take_1d(
+ self._blklocs, slobj, fill_value=-1, allow_fill=allow_fill
+ )
# When filling blknos, make sure blknos is updated before appending to
# blocks list, that way new blkno is exactly len(blocks).
@@ -1235,15 +1322,16 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
# FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,
# pytables serialization will break otherwise.
blocks = []
- for blkno, mgr_locs in libinternals.get_blkno_placements(blknos,
- self.nblocks,
- group=True):
+ for blkno, mgr_locs in libinternals.get_blkno_placements(
+ blknos, self.nblocks, group=True
+ ):
if blkno == -1:
# If we've got here, fill_tuple was not None.
fill_value = fill_tuple[0]
- blocks.append(self._make_na_block(placement=mgr_locs,
- fill_value=fill_value))
+ blocks.append(
+ self._make_na_block(placement=mgr_locs, fill_value=fill_value)
+ )
else:
blk = self.blocks[blkno]
@@ -1258,9 +1346,14 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
blocks.append(newblk)
else:
- blocks.append(blk.take_nd(blklocs[mgr_locs.indexer],
- axis=0, new_mgr_locs=mgr_locs,
- fill_tuple=None))
+ blocks.append(
+ blk.take_nd(
+ blklocs[mgr_locs.indexer],
+ axis=0,
+ new_mgr_locs=mgr_locs,
+ fill_tuple=None,
+ )
+ )
return blocks
@@ -1282,10 +1375,11 @@ def take(self, indexer, axis=1, verify=True, convert=True):
Take items along any axis.
"""
self._consolidate_inplace()
- indexer = (np.arange(indexer.start, indexer.stop, indexer.step,
- dtype='int64')
- if isinstance(indexer, slice)
- else np.asanyarray(indexer, dtype='int64'))
+ indexer = (
+ np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64")
+ if isinstance(indexer, slice)
+ else np.asanyarray(indexer, dtype="int64")
+ )
n = self.shape[axis]
if convert:
@@ -1293,12 +1387,14 @@ def take(self, indexer, axis=1, verify=True, convert=True):
if verify:
if ((indexer == -1) | (indexer >= n)).any():
- raise Exception('Indices must be nonzero and less than '
- 'the axis length')
+ raise Exception(
+ "Indices must be nonzero and less than " "the axis length"
+ )
new_labels = self.axes[axis].take(indexer)
- return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
- axis=axis, allow_dups=True)
+ return self.reindex_indexer(
+ new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
+ )
def equals(self, other):
self_axes, other_axes = self.axes, other.axes
@@ -1320,8 +1416,9 @@ def canonicalize(block):
self_blocks = sorted(self.blocks, key=canonicalize)
other_blocks = sorted(other.blocks, key=canonicalize)
- return all(block.equals(oblock)
- for block, oblock in zip(self_blocks, other_blocks))
+ return all(
+ block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks)
+ )
def unstack(self, unstacker_func, fill_value):
"""Return a blockmanager with all blocks unstacked.
@@ -1346,11 +1443,10 @@ def unstack(self, unstacker_func, fill_value):
for blk in self.blocks:
blocks, mask = blk._unstack(
- partial(unstacker_func,
- value_columns=self.items[blk.mgr_locs.indexer]),
+ partial(unstacker_func, value_columns=self.items[blk.mgr_locs.indexer]),
new_columns,
n_rows,
- fill_value
+ fill_value,
)
new_blocks.extend(blocks)
@@ -1370,15 +1466,18 @@ class SingleBlockManager(BlockManager):
_known_consolidated = True
__slots__ = ()
- def __init__(self,
- block: Block,
- axis: Union[Index, List[Index]],
- do_integrity_check: bool = False,
- fastpath: bool = False):
+ def __init__(
+ self,
+ block: Block,
+ axis: Union[Index, List[Index]],
+ do_integrity_check: bool = False,
+ fastpath: bool = False,
+ ):
if isinstance(axis, list):
if len(axis) != 1:
- raise ValueError("cannot create SingleBlockManager with more "
- "than 1 axis")
+ raise ValueError(
+ "cannot create SingleBlockManager with more " "than 1 axis"
+ )
axis = axis[0]
# passed from constructor, single block, single axis
@@ -1390,8 +1489,9 @@ def __init__(self,
if len(block) == 0:
block = [np.array([])]
elif len(block) != 1:
- raise ValueError('Cannot create SingleBlockManager with '
- 'more than 1 block')
+ raise ValueError(
+ "Cannot create SingleBlockManager with " "more than 1 block"
+ )
block = block[0]
else:
self.axes = [ensure_index(axis)]
@@ -1406,8 +1506,9 @@ def __init__(self,
block = _consolidate(block)
if len(block) != 1:
- raise ValueError('Cannot create SingleBlockManager with '
- 'more than 1 block')
+ raise ValueError(
+ "Cannot create SingleBlockManager with " "more than 1 block"
+ )
block = block[0]
if not isinstance(block, Block):
@@ -1440,8 +1541,9 @@ def get_slice(self, slobj, axis=0):
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
- return self.__class__(self._block._slice(slobj),
- self.index[slobj], fastpath=True)
+ return self.__class__(
+ self._block._slice(slobj), self.index[slobj], fastpath=True
+ )
@property
def index(self):
@@ -1449,8 +1551,8 @@ def index(self):
def convert(self, **kwargs):
""" convert the whole block as one """
- kwargs['by_item'] = False
- return self.apply('convert', **kwargs)
+ kwargs["by_item"] = False
+ return self.apply("convert", **kwargs)
@property
def dtype(self):
@@ -1547,13 +1649,11 @@ def concat(self, to_concat, new_axis):
else:
values = [x.values for x in blocks]
values = _concat._concat_compat(values)
- new_block = make_block(
- values, placement=slice(0, len(values), 1))
+ new_block = make_block(values, placement=slice(0, len(values), 1))
else:
values = [x._block.values for x in to_concat]
values = _concat._concat_compat(values)
- new_block = make_block(
- values, placement=slice(0, len(values), 1))
+ new_block = make_block(values, placement=slice(0, len(values), 1))
mgr = SingleBlockManager(new_block, new_axis)
return mgr
@@ -1562,6 +1662,7 @@ def concat(self, to_concat, new_axis):
# --------------------------------------------------------------------
# Constructor Helpers
+
def create_block_manager_from_blocks(blocks, axes):
try:
if len(blocks) == 1 and not isinstance(blocks[0], Block):
@@ -1572,15 +1673,16 @@ def create_block_manager_from_blocks(blocks, axes):
# It's OK if a single block is passed as values, its placement
# is basically "all items", but if there're many, don't bother
# converting, it's an error anyway.
- blocks = [make_block(values=blocks[0],
- placement=slice(0, len(axes[0])))]
+ blocks = [
+ make_block(values=blocks[0], placement=slice(0, len(axes[0])))
+ ]
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
- blocks = [getattr(b, 'values', b) for b in blocks]
+ blocks = [getattr(b, "values", b) for b in blocks]
tot_items = sum(b.shape[0] for b in blocks)
construction_error(tot_items, blocks[0].shape[1:], axes, e)
@@ -1612,12 +1714,14 @@ def construction_error(tot_items, block_shape, axes, e=None):
raise e
if block_shape[0] == 0:
raise ValueError("Empty data passed with indices specified.")
- raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
- passed, implied))
+ raise ValueError(
+ "Shape of passed values is {0}, indices imply {1}".format(passed, implied)
+ )
# -----------------------------------------------------------------------
+
def form_blocks(arrays, names, axes):
# put "leftover" items in float bucket, where else?
# generalize?
@@ -1643,60 +1747,61 @@ def form_blocks(arrays, names, axes):
items_dict[block_type.__name__].append((i, k, v))
blocks = []
- if len(items_dict['FloatBlock']):
- float_blocks = _multi_blockify(items_dict['FloatBlock'])
+ if len(items_dict["FloatBlock"]):
+ float_blocks = _multi_blockify(items_dict["FloatBlock"])
blocks.extend(float_blocks)
- if len(items_dict['ComplexBlock']):
- complex_blocks = _multi_blockify(items_dict['ComplexBlock'])
+ if len(items_dict["ComplexBlock"]):
+ complex_blocks = _multi_blockify(items_dict["ComplexBlock"])
blocks.extend(complex_blocks)
- if len(items_dict['TimeDeltaBlock']):
- timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock'])
+ if len(items_dict["TimeDeltaBlock"]):
+ timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
blocks.extend(timedelta_blocks)
- if len(items_dict['IntBlock']):
- int_blocks = _multi_blockify(items_dict['IntBlock'])
+ if len(items_dict["IntBlock"]):
+ int_blocks = _multi_blockify(items_dict["IntBlock"])
blocks.extend(int_blocks)
- if len(items_dict['DatetimeBlock']):
- datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'],
- _NS_DTYPE)
+ if len(items_dict["DatetimeBlock"]):
+ datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], _NS_DTYPE)
blocks.extend(datetime_blocks)
- if len(items_dict['DatetimeTZBlock']):
- dttz_blocks = [make_block(array,
- klass=DatetimeTZBlock,
- placement=[i])
- for i, _, array in items_dict['DatetimeTZBlock']]
+ if len(items_dict["DatetimeTZBlock"]):
+ dttz_blocks = [
+ make_block(array, klass=DatetimeTZBlock, placement=[i])
+ for i, _, array in items_dict["DatetimeTZBlock"]
+ ]
blocks.extend(dttz_blocks)
- if len(items_dict['BoolBlock']):
- bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_)
+ if len(items_dict["BoolBlock"]):
+ bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_)
blocks.extend(bool_blocks)
- if len(items_dict['ObjectBlock']) > 0:
- object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_)
+ if len(items_dict["ObjectBlock"]) > 0:
+ object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
blocks.extend(object_blocks)
- if len(items_dict['CategoricalBlock']) > 0:
- cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i])
- for i, _, array in items_dict['CategoricalBlock']]
+ if len(items_dict["CategoricalBlock"]) > 0:
+ cat_blocks = [
+ make_block(array, klass=CategoricalBlock, placement=[i])
+ for i, _, array in items_dict["CategoricalBlock"]
+ ]
blocks.extend(cat_blocks)
- if len(items_dict['ExtensionBlock']):
+ if len(items_dict["ExtensionBlock"]):
external_blocks = [
make_block(array, klass=ExtensionBlock, placement=[i])
- for i, _, array in items_dict['ExtensionBlock']
+ for i, _, array in items_dict["ExtensionBlock"]
]
blocks.extend(external_blocks)
- if len(items_dict['ObjectValuesExtensionBlock']):
+ if len(items_dict["ObjectValuesExtensionBlock"]):
external_blocks = [
make_block(array, klass=ObjectValuesExtensionBlock, placement=[i])
- for i, _, array in items_dict['ObjectValuesExtensionBlock']
+ for i, _, array in items_dict["ObjectValuesExtensionBlock"]
]
blocks.extend(external_blocks)
@@ -1756,7 +1861,7 @@ def _asarray_compat(x):
def _shape_compat(x):
if isinstance(x, ABCSeries):
- return len(x),
+ return (len(x),)
else:
return x.shape
@@ -1773,7 +1878,8 @@ def _shape_compat(x):
def _interleaved_dtype(
- blocks: List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]:
+ blocks: List[Block]
+) -> Optional[Union[np.dtype, ExtensionDtype]]:
"""Find the common dtype for `blocks`.
Parameters
@@ -1802,8 +1908,9 @@ def _consolidate(blocks):
new_blocks = []
for (_can_consolidate, dtype), group_blocks in grouper:
- merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
- _can_consolidate=_can_consolidate)
+ merged_blocks = _merge_blocks(
+ list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate
+ )
new_blocks = _extend_blocks(merged_blocks, new_blocks)
return new_blocks
@@ -1828,8 +1935,9 @@ def _compare_or_regex_search(a, b, regex=False):
if not regex:
op = lambda x: operator.eq(x, b)
else:
- op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str)
- else False)
+ op = np.vectorize(
+ lambda x: bool(re.search(b, x)) if isinstance(x, str) else False
+ )
is_a_array = isinstance(a, np.ndarray)
is_b_array = isinstance(b, np.ndarray)
@@ -1848,14 +1956,16 @@ def _compare_or_regex_search(a, b, regex=False):
type_names = [type(a).__name__, type(b).__name__]
if is_a_array:
- type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype)
+ type_names[0] = "ndarray(dtype={dtype})".format(dtype=a.dtype)
if is_b_array:
- type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype)
+ type_names[1] = "ndarray(dtype={dtype})".format(dtype=b.dtype)
raise TypeError(
- "Cannot compare types {a!r} and {b!r}".format(a=type_names[0],
- b=type_names[1]))
+ "Cannot compare types {a!r} and {b!r}".format(
+ a=type_names[0], b=type_names[1]
+ )
+ )
return result
@@ -1869,8 +1979,10 @@ def _transform_index(index, func, level=None):
"""
if isinstance(index, MultiIndex):
if level is not None:
- items = [tuple(func(y) if i == level else y
- for i, y in enumerate(x)) for x in index]
+ items = [
+ tuple(func(y) if i == level else y for i, y in enumerate(x))
+ for x in index
+ ]
else:
items = [tuple(func(y) for y in x) for x in index]
return MultiIndex.from_tuples(items, names=index.names)
@@ -1888,16 +2000,20 @@ def _fast_count_smallints(arr):
def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
if isinstance(slice_or_indexer, slice):
- return ('slice', slice_or_indexer,
- libinternals.slice_len(slice_or_indexer, length))
- elif (isinstance(slice_or_indexer, np.ndarray) and
- slice_or_indexer.dtype == np.bool_):
- return 'mask', slice_or_indexer, slice_or_indexer.sum()
+ return (
+ "slice",
+ slice_or_indexer,
+ libinternals.slice_len(slice_or_indexer, length),
+ )
+ elif (
+ isinstance(slice_or_indexer, np.ndarray) and slice_or_indexer.dtype == np.bool_
+ ):
+ return "mask", slice_or_indexer, slice_or_indexer.sum()
else:
indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
if not allow_fill:
indexer = maybe_convert_indices(indexer, length)
- return 'fancy', indexer, len(indexer)
+ return "fancy", indexer, len(indexer)
def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
@@ -1912,8 +2028,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
copy : bool
"""
- concat_plans = [get_mgr_concatenation_plan(mgr, indexers)
- for mgr, indexers in mgrs_indexers]
+ concat_plans = [
+ get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
+ ]
concat_plan = combine_concat_plans(concat_plans, concat_axis)
blocks = []
@@ -1929,11 +2046,13 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
b = b.make_block_same_class(values, placement=placement)
elif is_uniform_join_units(join_units):
b = join_units[0].block.concat_same_type(
- [ju.block for ju in join_units], placement=placement)
+ [ju.block for ju in join_units], placement=placement
+ )
else:
b = make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
- placement=placement)
+ placement=placement,
+ )
blocks.append(b)
return BlockManager(blocks, axes)
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index 4230b212f567a..ad4b5e4523806 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -10,9 +10,17 @@
from pandas.core.dtypes.cast import infer_dtype_from_array
from pandas.core.dtypes.common import (
- ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype,
- is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar,
- is_timedelta64_dtype, needs_i8_conversion)
+ ensure_float64,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_numeric_v_string_like,
+ is_scalar,
+ is_timedelta64_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.missing import isna
@@ -69,47 +77,73 @@ def mask_missing(arr, values_to_mask):
def clean_fill_method(method, allow_nearest=False):
# asfreq is compat for resampling
- if method in [None, 'asfreq']:
+ if method in [None, "asfreq"]:
return None
if isinstance(method, str):
method = method.lower()
- if method == 'ffill':
- method = 'pad'
- elif method == 'bfill':
- method = 'backfill'
+ if method == "ffill":
+ method = "pad"
+ elif method == "bfill":
+ method = "backfill"
- valid_methods = ['pad', 'backfill']
- expecting = 'pad (ffill) or backfill (bfill)'
+ valid_methods = ["pad", "backfill"]
+ expecting = "pad (ffill) or backfill (bfill)"
if allow_nearest:
- valid_methods.append('nearest')
- expecting = 'pad (ffill), backfill (bfill) or nearest'
+ valid_methods.append("nearest")
+ expecting = "pad (ffill), backfill (bfill) or nearest"
if method not in valid_methods:
- msg = ('Invalid fill method. Expecting {expecting}. Got {method}'
- .format(expecting=expecting, method=method))
+ msg = "Invalid fill method. Expecting {expecting}. Got {method}".format(
+ expecting=expecting, method=method
+ )
raise ValueError(msg)
return method
def clean_interp_method(method, **kwargs):
- order = kwargs.get('order')
- valid = ['linear', 'time', 'index', 'values', 'nearest', 'zero', 'slinear',
- 'quadratic', 'cubic', 'barycentric', 'polynomial', 'krogh',
- 'piecewise_polynomial', 'pchip', 'akima', 'spline',
- 'from_derivatives']
- if method in ('spline', 'polynomial') and order is None:
- raise ValueError("You must specify the order of the spline or "
- "polynomial.")
+ order = kwargs.get("order")
+ valid = [
+ "linear",
+ "time",
+ "index",
+ "values",
+ "nearest",
+ "zero",
+ "slinear",
+ "quadratic",
+ "cubic",
+ "barycentric",
+ "polynomial",
+ "krogh",
+ "piecewise_polynomial",
+ "pchip",
+ "akima",
+ "spline",
+ "from_derivatives",
+ ]
+ if method in ("spline", "polynomial") and order is None:
+ raise ValueError("You must specify the order of the spline or " "polynomial.")
if method not in valid:
- raise ValueError("method must be one of {valid}. Got '{method}' "
- "instead.".format(valid=valid, method=method))
+ raise ValueError(
+ "method must be one of {valid}. Got '{method}' "
+ "instead.".format(valid=valid, method=method)
+ )
return method
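As the reformatted bodies above make explicit, clean_fill_method only normalises aliases and validates the name, while clean_interp_method additionally requires an order for the spline and polynomial methods. A small sketch of the expected behaviour, based on the code shown in this hunk:

from pandas.core.missing import clean_fill_method, clean_interp_method

clean_fill_method("ffill")                         # -> "pad"
clean_fill_method("bfill")                         # -> "backfill"
clean_fill_method(None)                            # -> None ("asfreq" behaves the same)
clean_fill_method("nearest", allow_nearest=True)   # -> "nearest"
clean_interp_method("spline", order=3)             # -> "spline"
# clean_interp_method("spline") without an order raises ValueError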
-def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
- limit_direction='forward', limit_area=None, fill_value=None,
- bounds_error=False, order=None, **kwargs):
+def interpolate_1d(
+ xvalues,
+ yvalues,
+ method="linear",
+ limit=None,
+ limit_direction="forward",
+ limit_area=None,
+ fill_value=None,
+ bounds_error=False,
+ order=None,
+ **kwargs
+):
"""
Logic for the 1-d interpolation. The result should be 1-d, inputs
xvalues and yvalues will each be 1-d arrays of the same length.
@@ -132,39 +166,44 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
if valid.all():
return yvalues
- if method == 'time':
- if not getattr(xvalues, 'is_all_dates', None):
+ if method == "time":
+ if not getattr(xvalues, "is_all_dates", None):
# if not issubclass(xvalues.dtype.type, np.datetime64):
- raise ValueError('time-weighted interpolation only works '
- 'on Series or DataFrames with a '
- 'DatetimeIndex')
- method = 'values'
-
- valid_limit_directions = ['forward', 'backward', 'both']
+ raise ValueError(
+ "time-weighted interpolation only works "
+ "on Series or DataFrames with a "
+ "DatetimeIndex"
+ )
+ method = "values"
+
+ valid_limit_directions = ["forward", "backward", "both"]
limit_direction = limit_direction.lower()
if limit_direction not in valid_limit_directions:
- msg = ('Invalid limit_direction: expecting one of {valid!r}, '
- 'got {invalid!r}.')
- raise ValueError(msg.format(valid=valid_limit_directions,
- invalid=limit_direction))
+ msg = "Invalid limit_direction: expecting one of {valid!r}, " "got {invalid!r}."
+ raise ValueError(
+ msg.format(valid=valid_limit_directions, invalid=limit_direction)
+ )
if limit_area is not None:
- valid_limit_areas = ['inside', 'outside']
+ valid_limit_areas = ["inside", "outside"]
limit_area = limit_area.lower()
if limit_area not in valid_limit_areas:
- raise ValueError('Invalid limit_area: expecting one of {}, got '
- '{}.'.format(valid_limit_areas, limit_area))
+ raise ValueError(
+ "Invalid limit_area: expecting one of {}, got "
+ "{}.".format(valid_limit_areas, limit_area)
+ )
# default limit is unlimited GH #16282
if limit is None:
# limit = len(xvalues)
pass
elif not is_integer(limit):
- raise ValueError('Limit must be an integer')
+ raise ValueError("Limit must be an integer")
elif limit < 1:
- raise ValueError('Limit must be greater than 0')
+ raise ValueError("Limit must be greater than 0")
from pandas import Series
+
ys = Series(yvalues)
# These are sets of index pointers to invalid values... i.e. {0, 1, etc...
@@ -182,9 +221,9 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
# are more than 'limit' away from the prior non-NaN.
# set preserve_nans based on direction using _interp_limit
- if limit_direction == 'forward':
+ if limit_direction == "forward":
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
- elif limit_direction == 'backward':
+ elif limit_direction == "backward":
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
else:
# both directions... just use _interp_limit
@@ -192,22 +231,22 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
# if limit_area is set, add either mid or outside indices
# to preserve_nans GH #16284
- if limit_area == 'inside':
+ if limit_area == "inside":
# preserve NaNs on the outside
preserve_nans |= start_nans | end_nans
- elif limit_area == 'outside':
+ elif limit_area == "outside":
# preserve NaNs on the inside
preserve_nans |= mid_nans
# sort preserve_nans and convert to list
preserve_nans = sorted(preserve_nans)
- xvalues = getattr(xvalues, 'values', xvalues)
- yvalues = getattr(yvalues, 'values', yvalues)
+ xvalues = getattr(xvalues, "values", xvalues)
+ yvalues = getattr(yvalues, "values", yvalues)
result = yvalues.copy()
- if method in ['linear', 'time', 'index', 'values']:
- if method in ('values', 'index'):
+ if method in ["linear", "time", "index", "values"]:
+ if method in ("values", "index"):
inds = np.asarray(xvalues)
# hack for DatetimeIndex, #1646
if needs_i8_conversion(inds.dtype.type):
@@ -220,73 +259,99 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
result[preserve_nans] = np.nan
return result
- sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
- 'barycentric', 'krogh', 'spline', 'polynomial',
- 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']
+ sp_methods = [
+ "nearest",
+ "zero",
+ "slinear",
+ "quadratic",
+ "cubic",
+ "barycentric",
+ "krogh",
+ "spline",
+ "polynomial",
+ "from_derivatives",
+ "piecewise_polynomial",
+ "pchip",
+ "akima",
+ ]
if method in sp_methods:
inds = np.asarray(xvalues)
# hack for DatetimeIndex, #1646
if issubclass(inds.dtype.type, np.datetime64):
inds = inds.view(np.int64)
- result[invalid] = _interpolate_scipy_wrapper(inds[valid],
- yvalues[valid],
- inds[invalid],
- method=method,
- fill_value=fill_value,
- bounds_error=bounds_error,
- order=order, **kwargs)
+ result[invalid] = _interpolate_scipy_wrapper(
+ inds[valid],
+ yvalues[valid],
+ inds[invalid],
+ method=method,
+ fill_value=fill_value,
+ bounds_error=bounds_error,
+ order=order,
+ **kwargs
+ )
result[preserve_nans] = np.nan
return result
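interpolate_1d is what Series.interpolate ultimately calls for the method-based paths, and the limit, limit_direction and limit_area handling above is what decides which NaNs survive. An illustration through the public API (values chosen only for the example):

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, np.nan, np.nan, 4.0, np.nan])

# limit=1, forward: at most one consecutive NaN after a valid value is filled.
s.interpolate(method="linear", limit=1, limit_direction="forward")

# limit_area="inside": leading and trailing NaNs are preserved, interior gaps are filled.
s.interpolate(method="linear", limit_area="inside")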
-def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None,
- bounds_error=False, order=None, **kwargs):
+def _interpolate_scipy_wrapper(
+ x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs
+):
"""
Passed off to scipy.interpolate.interp1d. method is scipy's kind.
Returns an array interpolated at new_x. Add any new methods to
the list in _clean_interp_method.
"""
- extra = '{method} interpolation requires SciPy.'.format(method=method)
- import_optional_dependency('scipy', extra=extra)
+ extra = "{method} interpolation requires SciPy.".format(method=method)
+ import_optional_dependency("scipy", extra=extra)
from scipy import interpolate
new_x = np.asarray(new_x)
# ignores some kwargs that could be passed along.
alt_methods = {
- 'barycentric': interpolate.barycentric_interpolate,
- 'krogh': interpolate.krogh_interpolate,
- 'from_derivatives': _from_derivatives,
- 'piecewise_polynomial': _from_derivatives,
+ "barycentric": interpolate.barycentric_interpolate,
+ "krogh": interpolate.krogh_interpolate,
+ "from_derivatives": _from_derivatives,
+ "piecewise_polynomial": _from_derivatives,
}
- if getattr(x, 'is_all_dates', False):
+ if getattr(x, "is_all_dates", False):
# GH 5975, scipy.interp1d can't handle datetime64s
- x, new_x = x._values.astype('i8'), new_x.astype('i8')
+ x, new_x = x._values.astype("i8"), new_x.astype("i8")
- if method == 'pchip':
+ if method == "pchip":
try:
- alt_methods['pchip'] = interpolate.pchip_interpolate
+ alt_methods["pchip"] = interpolate.pchip_interpolate
except AttributeError:
- raise ImportError("Your version of Scipy does not support "
- "PCHIP interpolation.")
- elif method == 'akima':
- alt_methods['akima'] = _akima_interpolate
-
- interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
- 'polynomial']
+ raise ImportError(
+ "Your version of Scipy does not support " "PCHIP interpolation."
+ )
+ elif method == "akima":
+ alt_methods["akima"] = _akima_interpolate
+
+ interp1d_methods = [
+ "nearest",
+ "zero",
+ "slinear",
+ "quadratic",
+ "cubic",
+ "polynomial",
+ ]
if method in interp1d_methods:
- if method == 'polynomial':
+ if method == "polynomial":
method = order
- terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value,
- bounds_error=bounds_error)
+ terp = interpolate.interp1d(
+ x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
+ )
new_y = terp(new_x)
- elif method == 'spline':
+ elif method == "spline":
# GH #10633, #24014
if isna(order) or (order <= 0):
- raise ValueError("order needs to be specified and greater than 0; "
- "got order: {}".format(order))
+ raise ValueError(
+ "order needs to be specified and greater than 0; "
+ "got order: {}".format(order)
+ )
terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
new_y = terp(new_x)
else:
@@ -341,8 +406,7 @@ def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):
# return the method for compat with scipy version & backwards compat
method = interpolate.BPoly.from_derivatives
- m = method(xi, yi.reshape(-1, 1),
- orders=order, extrapolate=extrapolate)
+ m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)
return m(x)
@@ -384,6 +448,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):
"""
from scipy import interpolate
+
P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
if der == 0:
@@ -394,8 +459,9 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0):
return [P(x, nu) for nu in der]
-def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
- dtype=None):
+def interpolate_2d(
+ values, method="pad", axis=0, limit=None, fill_value=None, dtype=None
+):
"""
Perform an actual interpolation of values; values will be made 2-d if
needed, fills in place, and returns the result.
@@ -407,8 +473,7 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
ndim = values.ndim
if values.ndim == 1:
if axis != 0: # pragma: no cover
- raise AssertionError("cannot interpolate on a ndim == 1 with "
- "axis != 0")
+ raise AssertionError("cannot interpolate on a ndim == 1 with " "axis != 0")
values = values.reshape(tuple((1,) + values.shape))
if fill_value is None:
@@ -417,12 +482,12 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,
mask = mask_missing(transf(values), fill_value)
method = clean_fill_method(method)
- if method == 'pad':
- values = transf(pad_2d(
- transf(values), limit=limit, mask=mask, dtype=dtype))
+ if method == "pad":
+ values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype))
else:
- values = transf(backfill_2d(
- transf(values), limit=limit, mask=mask, dtype=dtype))
+ values = transf(
+ backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)
+ )
# reshape back
if ndim == 1:
@@ -438,8 +503,11 @@ def _cast_values_for_fillna(values, dtype):
# TODO: for int-dtypes we make a copy, but for everything else this
# alters the values in-place. Is this intentional?
- if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or
- is_timedelta64_dtype(dtype)):
+ if (
+ is_datetime64_dtype(dtype)
+ or is_datetime64tz_dtype(dtype)
+ or is_timedelta64_dtype(dtype)
+ ):
values = values.view(np.int64)
elif is_integer_dtype(values):
@@ -498,7 +566,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None):
return values
-_fill_methods = {'pad': pad_1d, 'backfill': backfill_1d}
+_fill_methods = {"pad": pad_1d, "backfill": backfill_1d}
def get_fill_func(method):
@@ -523,10 +591,10 @@ def fill_zeros(result, x, y, name, fill):
if fill is None or is_float_dtype(result):
return result
- if name.startswith(('r', '__r')):
+ if name.startswith(("r", "__r")):
x, y = y, x
- is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type'))
+ is_variable_type = hasattr(y, "dtype") or hasattr(y, "type")
is_scalar_type = is_scalar(y)
if not is_variable_type and not is_scalar_type:
@@ -544,15 +612,15 @@ def fill_zeros(result, x, y, name, fill):
mask = ((y == 0) & ~np.isnan(result)).ravel()
shape = result.shape
- result = result.astype('float64', copy=False).ravel()
+ result = result.astype("float64", copy=False).ravel()
np.putmask(result, mask, fill)
# if we have a fill of inf, then sign it correctly
# (GH 6178 and PR 9308)
if np.isinf(fill):
- signs = y if name.startswith(('r', '__r')) else x
- signs = np.sign(signs.astype('float', copy=False))
+ signs = y if name.startswith(("r", "__r")) else x
+ signs = np.sign(signs.astype("float", copy=False))
negative_inf_mask = (signs.ravel() < 0) & mask
np.putmask(result, negative_inf_mask, -fill)
@@ -606,7 +674,7 @@ def mask_zero_div_zero(x, y, result, copy=False):
if nan_mask.any() or neginf_mask.any() or posinf_mask.any():
# Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN
- result = result.astype('float64', copy=copy).ravel()
+ result = result.astype("float64", copy=copy).ravel()
np.putmask(result, nan_mask, np.nan)
np.putmask(result, posinf_mask, np.inf)
@@ -633,9 +701,8 @@ def dispatch_missing(op, left, right, result):
-------
result : ndarray
"""
- opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__')
- if op in [operator.truediv, operator.floordiv,
- getattr(operator, 'div', None)]:
+ opstr = "__{opname}__".format(opname=op.__name__).replace("____", "__")
+ if op in [operator.truediv, operator.floordiv, getattr(operator, "div", None)]:
result = mask_zero_div_zero(left, right, result)
elif op is operator.mod:
result = fill_zeros(result, left, right, opstr, np.nan)
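dispatch_missing above is where pandas' division-by-zero semantics come from: truediv and floordiv are routed through mask_zero_div_zero, and mod through fill_zeros with a NaN fill. A quick illustration of the resulting behaviour via the public API:

import pandas as pd

s = pd.Series([1, -1, 0])
s / 0    # [inf, -inf, nan]: positive/0, negative/0 and 0/0
s // 0   # [inf, -inf, nan]
s % 0    # [nan, nan, nan]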
@@ -684,8 +751,9 @@ def _interp_limit(invalid, fw_limit, bw_limit):
def inner(invalid, limit):
limit = min(limit, N)
windowed = _rolling_window(invalid, limit + 1).all(1)
- idx = (set(np.where(windowed)[0] + limit) |
- set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
+ idx = set(np.where(windowed)[0] + limit) | set(
+ np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
+ )
return idx
if fw_limit is not None:
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index cc8b241bedba1..ce14cb22a88ce 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -12,18 +12,30 @@
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.common import (
- _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype,
- is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
- is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype,
- is_object_dtype, is_scalar, is_timedelta64_dtype, pandas_dtype)
+ _get_dtype,
+ is_any_int_dtype,
+ is_bool_dtype,
+ is_complex,
+ is_complex_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype,
+ is_float,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_numeric_dtype,
+ is_object_dtype,
+ is_scalar,
+ is_timedelta64_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna
import pandas.core.common as com
-bn = import_optional_dependency("bottleneck",
- raise_on_missing=False,
- on_version="warn")
+bn = import_optional_dependency("bottleneck", raise_on_missing=False, on_version="warn")
_BOTTLENECK_INSTALLED = bn is not None
_USE_BOTTLENECK = False
@@ -35,28 +47,26 @@ def set_use_bottleneck(v=True):
_USE_BOTTLENECK = v
-set_use_bottleneck(get_option('compute.use_bottleneck'))
+set_use_bottleneck(get_option("compute.use_bottleneck"))
class disallow:
-
def __init__(self, *dtypes):
super().__init__()
self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
def check(self, obj):
- return hasattr(obj, 'dtype') and issubclass(obj.dtype.type,
- self.dtypes)
+ return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
def __call__(self, f):
@functools.wraps(f)
def _f(*args, **kwargs):
obj_iter = itertools.chain(args, kwargs.values())
if any(self.check(obj) for obj in obj_iter):
- msg = 'reduction operation {name!r} not allowed for this dtype'
- raise TypeError(msg.format(name=f.__name__.replace('nan', '')))
+ msg = "reduction operation {name!r} not allowed for this dtype"
+ raise TypeError(msg.format(name=f.__name__.replace("nan", "")))
try:
- with np.errstate(invalid='ignore'):
+ with np.errstate(invalid="ignore"):
return f(*args, **kwargs)
except ValueError as e:
# we want to transform an object array
@@ -71,7 +81,6 @@ def _f(*args, **kwargs):
class bottleneck_switch:
-
def __init__(self, name=None, **kwargs):
self.name = name
self.kwargs = kwargs
@@ -91,7 +100,7 @@ def f(values, axis=None, skipna=True, **kwds):
if k not in kwds:
kwds[k] = v
try:
- if values.size == 0 and kwds.get('min_count') is None:
+ if values.size == 0 and kwds.get("min_count") is None:
# We are empty, returning NA for our type
# Only applies for the default `min_count` of None
# since that affects how empty arrays are handled.
@@ -100,8 +109,7 @@ def f(values, axis=None, skipna=True, **kwds):
# It *may* just be `var`
return _na_for_min_count(values, axis)
- if (_USE_BOTTLENECK and skipna and
- _bn_ok_dtype(values.dtype, bn_name)):
+ if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
result = bn_func(values, axis=axis, **kwds)
# prefer to treat inf/-inf as NA, but must compute the func
@@ -130,9 +138,9 @@ def f(values, axis=None, skipna=True, **kwds):
def _bn_ok_dtype(dt, name):
# Bottleneck chokes on datetime64
- if (not is_object_dtype(dt) and
- not (is_datetime_or_timedelta_dtype(dt) or
- is_datetime64tz_dtype(dt))):
+ if not is_object_dtype(dt) and not (
+ is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt)
+ ):
# GH 15507
# bottleneck does not properly upcast during the sum
@@ -142,7 +150,7 @@ def _bn_ok_dtype(dt, name):
# further we also want to preserve NaN when all elements
# are NaN, unlike bottleneck/numpy which consider this
# to be 0
- if name in ['nansum', 'nanprod']:
+ if name in ["nansum", "nanprod"]:
return False
return True
@@ -151,9 +159,9 @@ def _bn_ok_dtype(dt, name):
def _has_infs(result):
if isinstance(result, np.ndarray):
- if result.dtype == 'f8':
+ if result.dtype == "f8":
return lib.has_infs_f8(result.ravel())
- elif result.dtype == 'f4':
+ elif result.dtype == "f4":
return lib.has_infs_f4(result.ravel())
try:
return np.isinf(result).any()
@@ -170,7 +178,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
if fill_value_typ is None:
return np.nan
else:
- if fill_value_typ == '+inf':
+ if fill_value_typ == "+inf":
return np.inf
else:
return -np.inf
@@ -178,15 +186,16 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
if fill_value_typ is None:
return tslibs.iNaT
else:
- if fill_value_typ == '+inf':
+ if fill_value_typ == "+inf":
# need the max int here
return _int64_max
else:
return tslibs.iNaT
-def _maybe_get_mask(values: np.ndarray, skipna: bool,
- mask: Optional[np.ndarray]) -> Optional[np.ndarray]:
+def _maybe_get_mask(
+ values: np.ndarray, skipna: bool, mask: Optional[np.ndarray]
+) -> Optional[np.ndarray]:
""" This function will compute a mask iff it is necessary. Otherwise,
return the provided mask (potentially None) when a mask does not need to be
computed.
@@ -227,11 +236,13 @@ def _maybe_get_mask(values: np.ndarray, skipna: bool,
return mask
-def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None,
- fill_value_typ: Optional[str] = None,
- mask: Optional[np.ndarray] = None
- ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype,
- np.dtype, Any]:
+def _get_values(
+ values: np.ndarray,
+ skipna: bool,
+ fill_value: Any = None,
+ fill_value_typ: Optional[str] = None,
+ mask: Optional[np.ndarray] = None,
+) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
""" Utility to get the values view, mask, dtype, dtype_max, and fill_value.
If both mask and fill_value/fill_value_typ are not None and skipna is True,
@@ -288,8 +299,9 @@ def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None,
# get our fill value (in case we need to provide an alternative
# dtype for it)
- fill_value = _get_fill_value(dtype, fill_value=fill_value,
- fill_value_typ=fill_value_typ)
+ fill_value = _get_fill_value(
+ dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
+ )
copy = (mask is not None) and (fill_value is not None)
@@ -315,16 +327,19 @@ def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None,
def _isfinite(values):
if is_datetime_or_timedelta_dtype(values):
return isna(values)
- if (is_complex_dtype(values) or is_float_dtype(values) or
- is_integer_dtype(values) or is_bool_dtype(values)):
+ if (
+ is_complex_dtype(values)
+ or is_float_dtype(values)
+ or is_integer_dtype(values)
+ or is_bool_dtype(values)
+ ):
return ~np.isfinite(values)
- return ~np.isfinite(values.astype('float64'))
+ return ~np.isfinite(values.astype("float64"))
def _na_ok_dtype(dtype):
# TODO: what about datetime64tz? PeriodDtype?
- return not issubclass(dtype.type,
- (np.integer, np.timedelta64, np.datetime64))
+ return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64))
def _wrap_results(result, dtype, fill_value=None):
@@ -335,7 +350,7 @@ def _wrap_results(result, dtype, fill_value=None):
# GH#24293
fill_value = iNaT
if not isinstance(result, np.ndarray):
- tz = getattr(dtype, 'tz', None)
+ tz = getattr(dtype, "tz", None)
assert not isna(fill_value), "Expected non-null fill_value"
if result == fill_value:
result = np.nan
@@ -351,9 +366,9 @@ def _wrap_results(result, dtype, fill_value=None):
if np.fabs(result) > _int64_max:
raise ValueError("overflow in timedelta operation")
- result = tslibs.Timedelta(result, unit='ns')
+ result = tslibs.Timedelta(result, unit="ns")
else:
- result = result.astype('i8').view(dtype)
+ result = result.astype("i8").view(dtype)
return result
@@ -375,14 +390,13 @@ def _na_for_min_count(values, axis):
"""
# we either return np.nan or pd.NaT
if is_numeric_dtype(values):
- values = values.astype('float64')
+ values = values.astype("float64")
fill_value = na_value_for_dtype(values.dtype)
if values.ndim == 1:
return fill_value
else:
- result_shape = (values.shape[:axis] +
- values.shape[axis + 1:])
+ result_shape = values.shape[:axis] + values.shape[axis + 1 :]
result = np.empty(result_shape, dtype=values.dtype)
result.fill(fill_value)
return result
@@ -416,8 +430,7 @@ def nanany(values, axis=None, skipna=True, mask=None):
>>> nanops.nanany(s)
False
"""
- values, _, _, _, _ = _get_values(values, skipna, fill_value=False,
- mask=mask)
+ values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
return values.any(axis)
@@ -449,12 +462,11 @@ def nanall(values, axis=None, skipna=True, mask=None):
>>> nanops.nanall(s)
False
"""
- values, _, _, _, _ = _get_values(values, skipna, fill_value=True,
- mask=mask)
+ values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
return values.all(axis)
-@disallow('M8')
+@disallow("M8")
def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
"""
Sum the elements along an axis ignoring NaNs
@@ -479,21 +491,21 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
>>> nanops.nansum(s)
3.0
"""
- values, mask, dtype, dtype_max, _ = _get_values(values, skipna,
- fill_value=0, mask=mask)
+ values, mask, dtype, dtype_max, _ = _get_values(
+ values, skipna, fill_value=0, mask=mask
+ )
dtype_sum = dtype_max
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
dtype_sum = np.float64
the_sum = values.sum(axis, dtype=dtype_sum)
- the_sum = _maybe_null_out(the_sum, axis, mask, values.shape,
- min_count=min_count)
+ the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
return _wrap_results(the_sum, dtype)
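The min_count plumbing above, finished off by _maybe_null_out, is what lets empty or all-NA reductions return NA instead of the identity element when the caller asks for it. Through the public reductions this looks like:

import numpy as np
import pandas as pd

pd.Series([np.nan, np.nan]).sum()              # 0.0, the default min_count=0
pd.Series([np.nan, np.nan]).sum(min_count=1)   # nan, fewer than 1 valid value
pd.Series([], dtype=float).prod(min_count=1)   # nan, empty input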
-@disallow('M8', DatetimeTZDtype)
+@disallow("M8", DatetimeTZDtype)
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True, mask=None):
"""
@@ -520,12 +532,17 @@ def nanmean(values, axis=None, skipna=True, mask=None):
>>> nanops.nanmean(s)
1.5
"""
- values, mask, dtype, dtype_max, _ = _get_values(values, skipna,
- fill_value=0, mask=mask)
+ values, mask, dtype, dtype_max, _ = _get_values(
+ values, skipna, fill_value=0, mask=mask
+ )
dtype_sum = dtype_max
dtype_count = np.float64
- if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or
- is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)):
+ if (
+ is_integer_dtype(dtype)
+ or is_timedelta64_dtype(dtype)
+ or is_datetime64_dtype(dtype)
+ or is_datetime64tz_dtype(dtype)
+ ):
dtype_sum = np.float64
elif is_float_dtype(dtype):
dtype_sum = dtype
@@ -533,7 +550,7 @@ def nanmean(values, axis=None, skipna=True, mask=None):
count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
- if axis is not None and getattr(the_sum, 'ndim', False):
+ if axis is not None and getattr(the_sum, "ndim", False):
with np.errstate(all="ignore"):
# suppress division by zero warnings
the_mean = the_sum / count
@@ -546,7 +563,7 @@ def nanmean(values, axis=None, skipna=True, mask=None):
return _wrap_results(the_mean, dtype)
-@disallow('M8')
+@disallow("M8")
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True, mask=None):
"""
@@ -571,6 +588,7 @@ def nanmedian(values, axis=None, skipna=True, mask=None):
>>> nanops.nanmedian(s)
2.0
"""
+
def get_median(x):
mask = notna(x)
if not skipna and not mask.all():
@@ -579,7 +597,7 @@ def get_median(x):
values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)
if not is_float_dtype(values):
- values = values.astype('f8')
+ values = values.astype("f8")
if mask is not None:
values[mask] = np.nan
@@ -595,7 +613,8 @@ def get_median(x):
if notempty:
if not skipna:
return _wrap_results(
- np.apply_along_axis(get_median, axis, values), dtype)
+ np.apply_along_axis(get_median, axis, values), dtype
+ )
# fastpath for the skipna case
return _wrap_results(np.nanmedian(values, axis), dtype)
@@ -614,10 +633,13 @@ def get_median(x):
return _wrap_results(get_median(values) if notempty else np.nan, dtype)
-def _get_counts_nanvar(value_counts: Tuple[int], mask: Optional[np.ndarray],
- axis: Optional[int], ddof: int,
- dtype=float) -> Tuple[Union[int, np.ndarray],
- Union[int, np.ndarray]]:
+def _get_counts_nanvar(
+ value_counts: Tuple[int],
+ mask: Optional[np.ndarray],
+ axis: Optional[int],
+ ddof: int,
+ dtype=float,
+) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]:
""" Get the count of non-null values along an axis, accounting
for degrees of freedom.
@@ -656,7 +678,7 @@ def _get_counts_nanvar(value_counts: Tuple[int], mask: Optional[np.ndarray],
return count, d
-@disallow('M8')
+@disallow("M8")
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
"""
@@ -686,12 +708,11 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
>>> nanops.nanstd(s)
1.0
"""
- result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof,
- mask=mask))
+ result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
return _wrap_results(result, values.dtype)
-@disallow('M8')
+@disallow("M8")
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
"""
@@ -725,13 +746,12 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
dtype = values.dtype
mask = _maybe_get_mask(values, skipna, mask)
if is_any_int_dtype(values):
- values = values.astype('f8')
+ values = values.astype("f8")
if mask is not None:
values[mask] = np.nan
if is_float_dtype(values):
- count, d = _get_counts_nanvar(values.shape, mask, axis, ddof,
- values.dtype)
+ count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
else:
count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
@@ -761,7 +781,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
return _wrap_results(result, values.dtype)
-@disallow('M8', 'm8')
+@disallow("M8", "m8")
def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
"""
Compute the standard error in the mean along given axis while ignoring NaNs
@@ -797,7 +817,7 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
- values = values.astype('f8')
+ values = values.astype("f8")
count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
var = nanvar(values, axis, skipna, ddof=ddof)
@@ -806,20 +826,18 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
def _nanminmax(meth, fill_value_typ):
-
- @bottleneck_switch(name='nan' + meth)
+ @bottleneck_switch(name="nan" + meth)
def reduction(values, axis=None, skipna=True, mask=None):
values, mask, dtype, dtype_max, fill_value = _get_values(
- values, skipna, fill_value_typ=fill_value_typ, mask=mask)
+ values, skipna, fill_value_typ=fill_value_typ, mask=mask
+ )
- if ((axis is not None and values.shape[axis] == 0) or
- values.size == 0):
+ if (axis is not None and values.shape[axis] == 0) or values.size == 0:
try:
result = getattr(values, meth)(axis, dtype=dtype_max)
result.fill(np.nan)
- except (AttributeError, TypeError,
- ValueError, np.core._internal.AxisError):
+ except (AttributeError, TypeError, ValueError, np.core._internal.AxisError):
result = np.nan
else:
result = getattr(values, meth)(axis)
@@ -830,11 +848,11 @@ def reduction(values, axis=None, skipna=True, mask=None):
return reduction
-nanmin = _nanminmax('min', fill_value_typ='+inf')
-nanmax = _nanminmax('max', fill_value_typ='-inf')
+nanmin = _nanminmax("min", fill_value_typ="+inf")
+nanmax = _nanminmax("max", fill_value_typ="-inf")
-@disallow('O')
+@disallow("O")
def nanargmax(values, axis=None, skipna=True, mask=None):
"""
Parameters
@@ -858,13 +876,14 @@ def nanargmax(values, axis=None, skipna=True, mask=None):
4
"""
values, mask, dtype, _, _ = _get_values(
- values, True, fill_value_typ='-inf', mask=mask)
+ values, True, fill_value_typ="-inf", mask=mask
+ )
result = values.argmax(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
-@disallow('O')
+@disallow("O")
def nanargmin(values, axis=None, skipna=True, mask=None):
"""
Parameters
@@ -888,13 +907,14 @@ def nanargmin(values, axis=None, skipna=True, mask=None):
0
"""
values, mask, dtype, _, _ = _get_values(
- values, True, fill_value_typ='+inf', mask=mask)
+ values, True, fill_value_typ="+inf", mask=mask
+ )
result = values.argmin(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
-@disallow('M8', 'm8')
+@disallow("M8", "m8")
def nanskew(values, axis=None, skipna=True, mask=None):
""" Compute the sample skewness.
@@ -926,7 +946,7 @@ def nanskew(values, axis=None, skipna=True, mask=None):
values = com.values_from_object(values)
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
- values = values.astype('f8')
+ values = values.astype("f8")
count = _get_counts(values.shape, mask, axis)
else:
count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
@@ -954,7 +974,7 @@ def nanskew(values, axis=None, skipna=True, mask=None):
m2 = _zero_out_fperr(m2)
m3 = _zero_out_fperr(m3)
- with np.errstate(invalid='ignore', divide='ignore'):
+ with np.errstate(invalid="ignore", divide="ignore"):
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)
dtype = values.dtype
@@ -972,7 +992,7 @@ def nanskew(values, axis=None, skipna=True, mask=None):
return result
-@disallow('M8', 'm8')
+@disallow("M8", "m8")
def nankurt(values, axis=None, skipna=True, mask=None):
"""
Compute the sample excess kurtosis
@@ -1005,7 +1025,7 @@ def nankurt(values, axis=None, skipna=True, mask=None):
values = com.values_from_object(values)
mask = _maybe_get_mask(values, skipna, mask)
if not is_float_dtype(values.dtype):
- values = values.astype('f8')
+ values = values.astype("f8")
count = _get_counts(values.shape, mask, axis)
else:
count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
@@ -1026,7 +1046,7 @@ def nankurt(values, axis=None, skipna=True, mask=None):
m2 = adjusted2.sum(axis, dtype=np.float64)
m4 = adjusted4.sum(axis, dtype=np.float64)
- with np.errstate(invalid='ignore', divide='ignore'):
+ with np.errstate(invalid="ignore", divide="ignore"):
adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
numer = count * (count + 1) * (count - 1) * m4
denom = (count - 2) * (count - 3) * m2 ** 2
@@ -1046,7 +1066,7 @@ def nankurt(values, axis=None, skipna=True, mask=None):
if denom == 0:
return 0
- with np.errstate(invalid='ignore', divide='ignore'):
+ with np.errstate(invalid="ignore", divide="ignore"):
result = numer / denom - adj
dtype = values.dtype
@@ -1060,7 +1080,7 @@ def nankurt(values, axis=None, skipna=True, mask=None):
return result
-@disallow('M8', 'm8')
+@disallow("M8", "m8")
def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
"""
Parameters
@@ -1093,18 +1113,17 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
values = values.copy()
values[mask] = 1
result = values.prod(axis)
- return _maybe_null_out(result, axis, mask, values.shape,
- min_count=min_count)
+ return _maybe_null_out(result, axis, mask, values.shape, min_count=min_count)
-def _maybe_arg_null_out(result: np.ndarray, axis: Optional[int],
- mask: Optional[np.ndarray],
- skipna: bool) -> Union[np.ndarray, int]:
+def _maybe_arg_null_out(
+ result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], skipna: bool
+) -> Union[np.ndarray, int]:
# helper function for nanargmin/nanargmax
if mask is None:
return result
- if axis is None or not getattr(result, 'ndim', False):
+ if axis is None or not getattr(result, "ndim", False):
if skipna:
if mask.all():
result = -1
@@ -1121,8 +1140,12 @@ def _maybe_arg_null_out(result: np.ndarray, axis: Optional[int],
return result
-def _get_counts(values_shape: Tuple[int], mask: Optional[np.ndarray],
- axis: Optional[int], dtype=float) -> Union[int, np.ndarray]:
+def _get_counts(
+ values_shape: Tuple[int],
+ mask: Optional[np.ndarray],
+ axis: Optional[int],
+ dtype=float,
+) -> Union[int, np.ndarray]:
""" Get the count of non-null values along an axis
Parameters
@@ -1161,18 +1184,21 @@ def _get_counts(values_shape: Tuple[int], mask: Optional[np.ndarray],
return np.array(count, dtype=dtype)
-def _maybe_null_out(result: np.ndarray, axis: Optional[int],
- mask: Optional[np.ndarray], shape: Tuple,
- min_count: int = 1) -> np.ndarray:
- if (mask is not None and axis is not None and
- getattr(result, 'ndim', False)):
+def _maybe_null_out(
+ result: np.ndarray,
+ axis: Optional[int],
+ mask: Optional[np.ndarray],
+ shape: Tuple,
+ min_count: int = 1,
+) -> np.ndarray:
+ if mask is not None and axis is not None and getattr(result, "ndim", False):
null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
if np.any(null_mask):
if is_numeric_dtype(result):
if np.iscomplexobj(result):
- result = result.astype('c16')
+ result = result.astype("c16")
else:
- result = result.astype('f8')
+ result = result.astype("f8")
result[null_mask] = np.nan
else:
# GH12941, use None to auto cast null
@@ -1191,19 +1217,19 @@ def _maybe_null_out(result: np.ndarray, axis: Optional[int],
def _zero_out_fperr(arg):
# #18044 reference this behavior to fix rolling skew/kurt issue
if isinstance(arg, np.ndarray):
- with np.errstate(invalid='ignore'):
+ with np.errstate(invalid="ignore"):
return np.where(np.abs(arg) < 1e-14, 0, arg)
else:
return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
-@disallow('M8', 'm8')
-def nancorr(a, b, method='pearson', min_periods=None):
+@disallow("M8", "m8")
+def nancorr(a, b, method="pearson", min_periods=None):
"""
a, b: ndarrays
"""
if len(a) != len(b):
- raise AssertionError('Operands to nancorr must have same size')
+ raise AssertionError("Operands to nancorr must have same size")
if min_periods is None:
min_periods = 1
@@ -1221,7 +1247,7 @@ def nancorr(a, b, method='pearson', min_periods=None):
def get_corr_func(method):
- if method in ['kendall', 'spearman']:
+ if method in ["kendall", "spearman"]:
from scipy.stats import kendalltau, spearmanr
elif callable(method):
return method
@@ -1238,18 +1264,14 @@ def _kendall(a, b):
def _spearman(a, b):
return spearmanr(a, b)[0]
- _cor_methods = {
- 'pearson': _pearson,
- 'kendall': _kendall,
- 'spearman': _spearman
- }
+ _cor_methods = {"pearson": _pearson, "kendall": _kendall, "spearman": _spearman}
return _cor_methods[method]
-@disallow('M8', 'm8')
+@disallow("M8", "m8")
def nancov(a, b, min_periods=None):
if len(a) != len(b):
- raise AssertionError('Operands to nancov must have same size')
+ raise AssertionError("Operands to nancov must have same size")
if min_periods is None:
min_periods = 1
@@ -1284,10 +1306,12 @@ def _ensure_numeric(x):
try:
x = complex(x)
except Exception:
- raise TypeError('Could not convert {value!s} to numeric'
- .format(value=x))
+ raise TypeError(
+ "Could not convert {value!s} to numeric".format(value=x)
+ )
return x
+
# NA-friendly array comparisons
@@ -1297,12 +1321,12 @@ def f(x, y):
ymask = isna(y)
mask = xmask | ymask
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = op(x, y)
if mask.any():
if is_bool_dtype(result):
- result = result.astype('O')
+ result = result.astype("O")
np.putmask(result, mask, np.nan)
return result
@@ -1344,8 +1368,7 @@ def _nanpercentile_1d(values, mask, q, na_value, interpolation):
if lib.is_scalar(q):
return na_value
else:
- return np.array([na_value] * len(q),
- dtype=values.dtype)
+ return np.array([na_value] * len(q), dtype=values.dtype)
return np.percentile(values, q, interpolation=interpolation)
@@ -1372,8 +1395,9 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
"""
if not lib.is_scalar(mask) and mask.any():
if ndim == 1:
- return _nanpercentile_1d(values, mask, q, na_value,
- interpolation=interpolation)
+ return _nanpercentile_1d(
+ values, mask, q, na_value, interpolation=interpolation
+ )
else:
# for nonconsolidatable blocks mask is 1D, but values 2D
if mask.ndim < values.ndim:
@@ -1381,9 +1405,10 @@ def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):
if axis == 0:
values = values.T
mask = mask.T
- result = [_nanpercentile_1d(val, m, q, na_value,
- interpolation=interpolation)
- for (val, m) in zip(list(values), list(mask))]
+ result = [
+ _nanpercentile_1d(val, m, q, na_value, interpolation=interpolation)
+ for (val, m) in zip(list(values), list(mask))
+ ]
result = np.array(result, dtype=values.dtype, copy=False).T
return result
else:
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index 5dd8455073212..5c58a1433ba3c 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -16,16 +16,34 @@
from pandas.util._decorators import Appender
from pandas.core.dtypes.cast import (
- construct_1d_object_array_from_listlike, find_common_type,
- maybe_upcast_putmask)
+ construct_1d_object_array_from_listlike,
+ find_common_type,
+ maybe_upcast_putmask,
+)
from pandas.core.dtypes.common import (
- ensure_object, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype,
- is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype,
- is_scalar, is_timedelta64_dtype, needs_i8_conversion)
+ ensure_object,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetimelike_v_numeric,
+ is_extension_array_dtype,
+ is_integer_dtype,
+ is_list_like,
+ is_object_dtype,
+ is_period_dtype,
+ is_scalar,
+ is_timedelta64_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCIndex, ABCIndexClass, ABCSeries, ABCSparseArray,
- ABCSparseSeries)
+ ABCDataFrame,
+ ABCIndex,
+ ABCIndexClass,
+ ABCSeries,
+ ABCSparseArray,
+ ABCSparseSeries,
+)
from pandas.core.dtypes.missing import isna, notna
import pandas as pd
@@ -80,8 +98,8 @@ def _maybe_match_name(a, b):
--------
pandas.core.common.consensus_name_attr
"""
- a_has = hasattr(a, 'name')
- b_has = hasattr(b, 'name')
+ a_has = hasattr(a, "name")
+ b_has = hasattr(b, "name")
if a_has and b_has:
if a.name == b.name:
return a.name
@@ -137,6 +155,7 @@ def maybe_upcast_for_op(obj):
# Reversed Operations not available in the stdlib operator module.
# Defining these instead of using lambdas allows us to reference them by name.
+
def radd(left, right):
return right + left
@@ -166,8 +185,9 @@ def rmod(left, right):
# formatting operation; this is a TypeError
# otherwise perform the op
if isinstance(right, str):
- raise TypeError("{typ} cannot perform the operation mod".format(
- typ=type(left).__name__))
+ raise TypeError(
+ "{typ} cannot perform the operation mod".format(typ=type(left).__name__)
+ )
return right % left
@@ -194,6 +214,7 @@ def rxor(left, right):
# -----------------------------------------------------------------------------
+
def make_invalid_op(name):
"""
Return a binary method that always raises a TypeError.
@@ -206,9 +227,12 @@ def make_invalid_op(name):
-------
invalid_op : function
"""
+
def invalid_op(self, other=None):
- raise TypeError("cannot perform {name} with this index type: "
- "{typ}".format(name=name, typ=type(self).__name__))
+ raise TypeError(
+ "cannot perform {name} with this index type: "
+ "{typ}".format(name=name, typ=type(self).__name__)
+ )
invalid_op.__name__ = name
return invalid_op
@@ -239,18 +263,18 @@ def _gen_eval_kwargs(name):
# Series appear to only pass __add__, __radd__, ...
# but DataFrame gets both these dunder names _and_ non-dunder names
# add, radd, ...
- name = name.replace('__', '')
+ name = name.replace("__", "")
- if name.startswith('r'):
- if name not in ['radd', 'rand', 'ror', 'rxor']:
+ if name.startswith("r"):
+ if name not in ["radd", "rand", "ror", "rxor"]:
# Exclude commutative operations
- kwargs['reversed'] = True
+ kwargs["reversed"] = True
- if name in ['truediv', 'rtruediv']:
- kwargs['truediv'] = True
+ if name in ["truediv", "rtruediv"]:
+ kwargs["truediv"] = True
- if name in ['ne']:
- kwargs['masker'] = True
+ if name in ["ne"]:
+ kwargs["masker"] = True
return kwargs
@@ -269,11 +293,11 @@ def _gen_fill_zeros(name):
-------
fill_value : {None, np.nan, np.inf}
"""
- name = name.strip('__')
- if 'div' in name:
+ name = name.strip("__")
+ if "div" in name:
# truediv, floordiv, div, and reversed variants
fill_value = np.inf
- elif 'mod' in name:
+ elif "mod" in name:
# mod, rmod
fill_value = np.nan
else:
@@ -295,15 +319,15 @@ def _get_frame_op_default_axis(name):
-------
default_axis: str or None
"""
- if name.replace('__r', '__') in ['__and__', '__or__', '__xor__']:
+ if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]:
# bool methods
- return 'columns'
- elif name.startswith('__'):
+ return "columns"
+ elif name.startswith("__"):
# __add__, __mul__, ...
return None
else:
# add, mul, ...
- return 'columns'
+ return "columns"
def _get_opstr(op, cls):
@@ -321,41 +345,43 @@ def _get_opstr(op, cls):
op_str : string or None
"""
# numexpr is available for non-sparse classes
- subtyp = getattr(cls, '_subtyp', '')
- use_numexpr = 'sparse' not in subtyp
+ subtyp = getattr(cls, "_subtyp", "")
+ use_numexpr = "sparse" not in subtyp
if not use_numexpr:
# if we're not using numexpr, then don't pass a str_rep
return None
- return {operator.add: '+',
- radd: '+',
- operator.mul: '*',
- rmul: '*',
- operator.sub: '-',
- rsub: '-',
- operator.truediv: '/',
- rtruediv: '/',
- operator.floordiv: '//',
- rfloordiv: '//',
- operator.mod: None, # TODO: Why None for mod but '%' for rmod?
- rmod: '%',
- operator.pow: '**',
- rpow: '**',
- operator.eq: '==',
- operator.ne: '!=',
- operator.le: '<=',
- operator.lt: '<',
- operator.ge: '>=',
- operator.gt: '>',
- operator.and_: '&',
- rand_: '&',
- operator.or_: '|',
- ror_: '|',
- operator.xor: '^',
- rxor: '^',
- divmod: None,
- rdivmod: None}[op]
+ return {
+ operator.add: "+",
+ radd: "+",
+ operator.mul: "*",
+ rmul: "*",
+ operator.sub: "-",
+ rsub: "-",
+ operator.truediv: "/",
+ rtruediv: "/",
+ operator.floordiv: "//",
+ rfloordiv: "//",
+ operator.mod: None, # TODO: Why None for mod but '%' for rmod?
+ rmod: "%",
+ operator.pow: "**",
+ rpow: "**",
+ operator.eq: "==",
+ operator.ne: "!=",
+ operator.le: "<=",
+ operator.lt: "<",
+ operator.ge: ">=",
+ operator.gt: ">",
+ operator.and_: "&",
+ rand_: "&",
+ operator.or_: "|",
+ ror_: "|",
+ operator.xor: "^",
+ rxor: "^",
+ divmod: None,
+ rdivmod: None,
+ }[op]
def _get_op_name(op, special):
@@ -372,9 +398,9 @@ def _get_op_name(op, special):
-------
op_name : str
"""
- opname = op.__name__.strip('_')
+ opname = op.__name__.strip("_")
if special:
- opname = '__{opname}__'.format(opname=opname)
+ opname = "__{opname}__".format(opname=opname)
return opname
@@ -564,77 +590,89 @@ def _get_op_name(op, special):
_op_descriptions = {
# Arithmetic Operators
- 'add': {'op': '+',
- 'desc': 'Addition',
- 'reverse': 'radd',
- 'series_examples': _add_example_SERIES},
- 'sub': {'op': '-',
- 'desc': 'Subtraction',
- 'reverse': 'rsub',
- 'series_examples': _sub_example_SERIES},
- 'mul': {'op': '*',
- 'desc': 'Multiplication',
- 'reverse': 'rmul',
- 'series_examples': _mul_example_SERIES,
- 'df_examples': None},
- 'mod': {'op': '%',
- 'desc': 'Modulo',
- 'reverse': 'rmod',
- 'series_examples': _mod_example_SERIES},
- 'pow': {'op': '**',
- 'desc': 'Exponential power',
- 'reverse': 'rpow',
- 'series_examples': _pow_example_SERIES,
- 'df_examples': None},
- 'truediv': {'op': '/',
- 'desc': 'Floating division',
- 'reverse': 'rtruediv',
- 'series_examples': _div_example_SERIES,
- 'df_examples': None},
- 'floordiv': {'op': '//',
- 'desc': 'Integer division',
- 'reverse': 'rfloordiv',
- 'series_examples': _floordiv_example_SERIES,
- 'df_examples': None},
- 'divmod': {'op': 'divmod',
- 'desc': 'Integer division and modulo',
- 'reverse': 'rdivmod',
- 'series_examples': None,
- 'df_examples': None},
-
+ "add": {
+ "op": "+",
+ "desc": "Addition",
+ "reverse": "radd",
+ "series_examples": _add_example_SERIES,
+ },
+ "sub": {
+ "op": "-",
+ "desc": "Subtraction",
+ "reverse": "rsub",
+ "series_examples": _sub_example_SERIES,
+ },
+ "mul": {
+ "op": "*",
+ "desc": "Multiplication",
+ "reverse": "rmul",
+ "series_examples": _mul_example_SERIES,
+ "df_examples": None,
+ },
+ "mod": {
+ "op": "%",
+ "desc": "Modulo",
+ "reverse": "rmod",
+ "series_examples": _mod_example_SERIES,
+ },
+ "pow": {
+ "op": "**",
+ "desc": "Exponential power",
+ "reverse": "rpow",
+ "series_examples": _pow_example_SERIES,
+ "df_examples": None,
+ },
+ "truediv": {
+ "op": "/",
+ "desc": "Floating division",
+ "reverse": "rtruediv",
+ "series_examples": _div_example_SERIES,
+ "df_examples": None,
+ },
+ "floordiv": {
+ "op": "//",
+ "desc": "Integer division",
+ "reverse": "rfloordiv",
+ "series_examples": _floordiv_example_SERIES,
+ "df_examples": None,
+ },
+ "divmod": {
+ "op": "divmod",
+ "desc": "Integer division and modulo",
+ "reverse": "rdivmod",
+ "series_examples": None,
+ "df_examples": None,
+ },
# Comparison Operators
- 'eq': {'op': '==',
- 'desc': 'Equal to',
- 'reverse': None,
- 'series_examples': None},
- 'ne': {'op': '!=',
- 'desc': 'Not equal to',
- 'reverse': None,
- 'series_examples': None},
- 'lt': {'op': '<',
- 'desc': 'Less than',
- 'reverse': None,
- 'series_examples': None},
- 'le': {'op': '<=',
- 'desc': 'Less than or equal to',
- 'reverse': None,
- 'series_examples': None},
- 'gt': {'op': '>',
- 'desc': 'Greater than',
- 'reverse': None,
- 'series_examples': None},
- 'ge': {'op': '>=',
- 'desc': 'Greater than or equal to',
- 'reverse': None,
- 'series_examples': None}
+ "eq": {"op": "==", "desc": "Equal to", "reverse": None, "series_examples": None},
+ "ne": {
+ "op": "!=",
+ "desc": "Not equal to",
+ "reverse": None,
+ "series_examples": None,
+ },
+ "lt": {"op": "<", "desc": "Less than", "reverse": None, "series_examples": None},
+ "le": {
+ "op": "<=",
+ "desc": "Less than or equal to",
+ "reverse": None,
+ "series_examples": None,
+ },
+ "gt": {"op": ">", "desc": "Greater than", "reverse": None, "series_examples": None},
+ "ge": {
+ "op": ">=",
+ "desc": "Greater than or equal to",
+ "reverse": None,
+ "series_examples": None,
+ },
} # type: Dict[str, Dict[str, Optional[str]]]
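The table above drives the generated flex-method docstrings (operator symbol, description, reverse name, examples); the methods it documents are the familiar flex operators, which also accept fill_value. A brief public-API sketch with illustrative values:

import pandas as pd

s = pd.Series([1, 2, 3])
s.add(10)                                # same as s + 10
s.radd(10)                               # same as 10 + s
s.sub(pd.Series([1, 1]), fill_value=0)   # unmatched labels are treated as 0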
_op_names = list(_op_descriptions.keys())
for key in _op_names:
- reverse_op = _op_descriptions[key]['reverse']
+ reverse_op = _op_descriptions[key]["reverse"]
if reverse_op is not None:
_op_descriptions[reverse_op] = _op_descriptions[key].copy()
- _op_descriptions[reverse_op]['reverse'] = key
+ _op_descriptions[reverse_op]["reverse"] = key
_flex_doc_SERIES = """
Return {desc} of series and other, element-wise (binary operator `{op_name}`).
@@ -1007,42 +1045,43 @@ def _make_flex_doc(op_name, typ):
-------
doc : str
"""
- op_name = op_name.replace('__', '')
+ op_name = op_name.replace("__", "")
op_desc = _op_descriptions[op_name]
- if op_name.startswith('r'):
- equiv = 'other ' + op_desc['op'] + ' ' + typ
+ if op_name.startswith("r"):
+ equiv = "other " + op_desc["op"] + " " + typ
else:
- equiv = typ + ' ' + op_desc['op'] + ' other'
+ equiv = typ + " " + op_desc["op"] + " other"
- if typ == 'series':
+ if typ == "series":
base_doc = _flex_doc_SERIES
doc_no_examples = base_doc.format(
- desc=op_desc['desc'],
+ desc=op_desc["desc"],
op_name=op_name,
equiv=equiv,
- reverse=op_desc['reverse']
+ reverse=op_desc["reverse"],
)
- if op_desc['series_examples']:
- doc = doc_no_examples + op_desc['series_examples']
+ if op_desc["series_examples"]:
+ doc = doc_no_examples + op_desc["series_examples"]
else:
doc = doc_no_examples
- elif typ == 'dataframe':
+ elif typ == "dataframe":
base_doc = _flex_doc_FRAME
doc = base_doc.format(
- desc=op_desc['desc'],
+ desc=op_desc["desc"],
op_name=op_name,
equiv=equiv,
- reverse=op_desc['reverse']
+ reverse=op_desc["reverse"],
)
else:
- raise AssertionError('Invalid typ argument.')
+ raise AssertionError("Invalid typ argument.")
return doc
# -----------------------------------------------------------------------------
# Masking NA values and fallbacks for operations numpy does not support
+
def fill_binop(left, right, fill_value):
"""
If a non-None fill_value is given, replace null entries in left and right
@@ -1097,8 +1136,7 @@ def mask_cmp_op(x, y, op):
if isinstance(y, (np.ndarray, ABCSeries)):
yrav = y.ravel()
mask = notna(xrav) & notna(yrav)
- result[mask] = op(np.array(list(xrav[mask])),
- np.array(list(yrav[mask])))
+ result[mask] = op(np.array(list(xrav[mask])), np.array(list(yrav[mask])))
else:
mask = notna(xrav)
result[mask] = op(np.array(list(xrav[mask])), y)
@@ -1140,12 +1178,11 @@ def masked_arith_op(x, y, op):
# Without specifically raising here we get mismatched
# errors in Py3 (TypeError) vs Py2 (ValueError)
# Note: only an issue in the DataFrame case
- raise ValueError('Cannot broadcast operands together.')
+ raise ValueError("Cannot broadcast operands together.")
if mask.any():
- with np.errstate(all='ignore'):
- result[mask] = op(xrav[mask],
- com.values_from_object(yrav[mask]))
+ with np.errstate(all="ignore"):
+ result[mask] = op(xrav[mask], com.values_from_object(yrav[mask]))
else:
assert is_scalar(y), type(y)
@@ -1161,7 +1198,7 @@ def masked_arith_op(x, y, op):
mask = np.where(y == 1, False, mask)
if mask.any():
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result[mask] = op(xrav[mask], y)
result, changed = maybe_upcast_putmask(result, ~mask, np.nan)
@@ -1193,14 +1230,18 @@ def invalid_comparison(left, right, op):
elif op is operator.ne:
res_values = np.ones(left.shape, dtype=bool)
else:
- raise TypeError("Invalid comparison between dtype={dtype} and {typ}"
- .format(dtype=left.dtype, typ=type(right).__name__))
+ raise TypeError(
+ "Invalid comparison between dtype={dtype} and {typ}".format(
+ dtype=left.dtype, typ=type(right).__name__
+ )
+ )
return res_values
# -----------------------------------------------------------------------------
# Dispatch logic
+
def should_series_dispatch(left, right, op):
"""
Identify cases where a DataFrame operation should dispatch to its
@@ -1226,8 +1267,9 @@ def should_series_dispatch(left, right, op):
ldtype = left.dtypes.iloc[0]
rdtype = right.dtypes.iloc[0]
- if ((is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or
- (is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype))):
+ if (is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or (
+ is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype)
+ ):
# numpy integer dtypes as timedelta64 dtypes in this scenario
return True
@@ -1263,15 +1305,13 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None):
if lib.is_scalar(right) or np.ndim(right) == 0:
def column_op(a, b):
- return {i: func(a.iloc[:, i], b)
- for i in range(len(a.columns))}
+ return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
elif isinstance(right, ABCDataFrame):
assert right._indexed_same(left)
def column_op(a, b):
- return {i: func(a.iloc[:, i], b.iloc[:, i])
- for i in range(len(a.columns))}
+ return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))}
elif isinstance(right, ABCSeries) and axis == "columns":
# We only get here if called via left._combine_match_columns,
@@ -1279,15 +1319,13 @@ def column_op(a, b):
assert right.index.equals(left.columns)
def column_op(a, b):
- return {i: func(a.iloc[:, i], b.iloc[i])
- for i in range(len(a.columns))}
+ return {i: func(a.iloc[:, i], b.iloc[i]) for i in range(len(a.columns))}
elif isinstance(right, ABCSeries):
assert right.index.equals(left.index) # Handle other cases later
def column_op(a, b):
- return {i: func(a.iloc[:, i], b)
- for i in range(len(a.columns))}
+ return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))}
else:
# Remaining cases have less-obvious dispatch rules
@@ -1324,15 +1362,17 @@ def dispatch_to_index_op(op, left, right, index_class):
# avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes,
# left_idx may inherit a freq from a cached DatetimeIndex.
# See discussion in GH#19147.
- if getattr(left_idx, 'freq', None) is not None:
+ if getattr(left_idx, "freq", None) is not None:
left_idx = left_idx._shallow_copy(freq=None)
try:
result = op(left_idx, right)
except NullFrequencyError:
# DatetimeIndex and TimedeltaIndex with freq == None raise ValueError
# on add/sub of integers (or int-like). We re-raise as a TypeError.
- raise TypeError('incompatible type for a datetime/timedelta '
- 'operation [{name}]'.format(name=op.__name__))
+ raise TypeError(
+ "incompatible type for a datetime/timedelta "
+ "operation [{name}]".format(name=op.__name__)
+ )
return result
@@ -1359,9 +1399,8 @@ def dispatch_to_extension_op(op, left, right):
res_values = op(new_left, new_right)
res_name = get_op_result_name(left, right)
- if op.__name__ in ['divmod', 'rdivmod']:
- return _construct_divmod_result(
- left, res_values, left.index, res_name)
+ if op.__name__ in ["divmod", "rdivmod"]:
+ return _construct_divmod_result(left, res_values, left.index, res_name)
return _construct_result(left, res_values, left.index, res_name)
@@ -1370,6 +1409,7 @@ def dispatch_to_extension_op(op, left, right):
# Functions that add arithmetic methods to objects, given arithmetic factory
# methods
+
def _get_method_wrappers(cls):
"""
Find the appropriate operation-wrappers to use when defining flex/special
@@ -1451,33 +1491,39 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special):
rpow=arith_method(cls, rpow, special),
rmod=arith_method(cls, rmod, special))
# yapf: enable
- new_methods['div'] = new_methods['truediv']
- new_methods['rdiv'] = new_methods['rtruediv']
+ new_methods["div"] = new_methods["truediv"]
+ new_methods["rdiv"] = new_methods["rtruediv"]
if have_divmod:
# divmod doesn't have an op that is supported by numexpr
- new_methods['divmod'] = arith_method(cls, divmod, special)
- new_methods['rdivmod'] = arith_method(cls, rdivmod, special)
+ new_methods["divmod"] = arith_method(cls, divmod, special)
+ new_methods["rdivmod"] = arith_method(cls, rdivmod, special)
- new_methods.update(dict(
- eq=comp_method(cls, operator.eq, special),
- ne=comp_method(cls, operator.ne, special),
- lt=comp_method(cls, operator.lt, special),
- gt=comp_method(cls, operator.gt, special),
- le=comp_method(cls, operator.le, special),
- ge=comp_method(cls, operator.ge, special)))
+ new_methods.update(
+ dict(
+ eq=comp_method(cls, operator.eq, special),
+ ne=comp_method(cls, operator.ne, special),
+ lt=comp_method(cls, operator.lt, special),
+ gt=comp_method(cls, operator.gt, special),
+ le=comp_method(cls, operator.le, special),
+ ge=comp_method(cls, operator.ge, special),
+ )
+ )
if bool_method:
new_methods.update(
- dict(and_=bool_method(cls, operator.and_, special),
- or_=bool_method(cls, operator.or_, special),
- # For some reason ``^`` wasn't used in original.
- xor=bool_method(cls, operator.xor, special),
- rand_=bool_method(cls, rand_, special),
- ror_=bool_method(cls, ror_, special),
- rxor=bool_method(cls, rxor, special)))
+ dict(
+ and_=bool_method(cls, operator.and_, special),
+ or_=bool_method(cls, operator.or_, special),
+ # For some reason ``^`` wasn't used in original.
+ xor=bool_method(cls, operator.xor, special),
+ rand_=bool_method(cls, rand_, special),
+ ror_=bool_method(cls, ror_, special),
+ rxor=bool_method(cls, rxor, special),
+ )
+ )
if special:
- dunderize = lambda x: '__{name}__'.format(name=x.strip('_'))
+ dunderize = lambda x: "__{name}__".format(name=x.strip("_"))
else:
dunderize = lambda x: x
new_methods = {dunderize(k): v for k, v in new_methods.items()}
@@ -1490,8 +1536,7 @@ def add_methods(cls, new_methods):
# of the same name, it is OK to over-write it. The exception is
# inplace methods (__iadd__, __isub__, ...) for SparseArray, which
# retain the np.ndarray versions.
- force = not (issubclass(cls, ABCSparseArray) and
- name.startswith('__i'))
+ force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i"))
if force or name not in cls.__dict__:
setattr(cls, name, method)
@@ -1509,8 +1554,9 @@ def add_special_arithmetic_methods(cls):
special methods will be defined and pinned to this class
"""
_, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls)
- new_methods = _create_methods(cls, arith_method, comp_method, bool_method,
- special=True)
+ new_methods = _create_methods(
+ cls, arith_method, comp_method, bool_method, special=True
+ )
# inplace operators (I feel like these should get passed an `inplace=True`
# or just be removed)
@@ -1524,8 +1570,9 @@ def f(self, other):
# this makes sure that we are aligned like the input
# we are updating inplace so we want to ignore is_copy
- self._update_inplace(result.reindex_like(self, copy=False)._data,
- verify_is_copy=False)
+ self._update_inplace(
+ result.reindex_like(self, copy=False)._data, verify_is_copy=False
+ )
return self
@@ -1533,18 +1580,24 @@ def f(self, other):
return f
new_methods.update(
- dict(__iadd__=_wrap_inplace_method(new_methods["__add__"]),
- __isub__=_wrap_inplace_method(new_methods["__sub__"]),
- __imul__=_wrap_inplace_method(new_methods["__mul__"]),
- __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]),
- __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]),
- __imod__=_wrap_inplace_method(new_methods["__mod__"]),
- __ipow__=_wrap_inplace_method(new_methods["__pow__"])))
+ dict(
+ __iadd__=_wrap_inplace_method(new_methods["__add__"]),
+ __isub__=_wrap_inplace_method(new_methods["__sub__"]),
+ __imul__=_wrap_inplace_method(new_methods["__mul__"]),
+ __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]),
+ __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]),
+ __imod__=_wrap_inplace_method(new_methods["__mod__"]),
+ __ipow__=_wrap_inplace_method(new_methods["__pow__"]),
+ )
+ )
new_methods.update(
- dict(__iand__=_wrap_inplace_method(new_methods["__and__"]),
- __ior__=_wrap_inplace_method(new_methods["__or__"]),
- __ixor__=_wrap_inplace_method(new_methods["__xor__"])))
+ dict(
+ __iand__=_wrap_inplace_method(new_methods["__and__"]),
+ __ior__=_wrap_inplace_method(new_methods["__or__"]),
+ __ixor__=_wrap_inplace_method(new_methods["__xor__"]),
+ )
+ )
add_methods(cls, new_methods=new_methods)
@@ -1560,14 +1613,18 @@ def add_flex_arithmetic_methods(cls):
flex methods will be defined and pinned to this class
"""
flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls)
- new_methods = _create_methods(cls, flex_arith_method,
- flex_comp_method, bool_method=None,
- special=False)
- new_methods.update(dict(multiply=new_methods['mul'],
- subtract=new_methods['sub'],
- divide=new_methods['div']))
+ new_methods = _create_methods(
+ cls, flex_arith_method, flex_comp_method, bool_method=None, special=False
+ )
+ new_methods.update(
+ dict(
+ multiply=new_methods["mul"],
+ subtract=new_methods["sub"],
+ divide=new_methods["div"],
+ )
+ )
# opt out of bool flex methods for now
- assert not any(kname in new_methods for kname in ('ror_', 'rxor', 'rand_'))
+ assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_"))
add_methods(cls, new_methods=new_methods)
@@ -1575,6 +1632,7 @@ def add_flex_arithmetic_methods(cls):
# -----------------------------------------------------------------------------
# Series
+
def _align_method_SERIES(left, right, align_asobject=False):
""" align lhs and rhs Series """
@@ -1612,10 +1670,8 @@ def _construct_divmod_result(left, result, index, name, dtype=None):
"""divmod returns a tuple of like indexed series instead of a single series.
"""
return (
- _construct_result(left, result[0], index=index, name=name,
- dtype=dtype),
- _construct_result(left, result[1], index=index, name=name,
- dtype=dtype),
+ _construct_result(left, result[0], index=index, name=name, dtype=dtype),
+ _construct_result(left, result[1], index=index, name=name, dtype=dtype),
)
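The hunk above only rewraps `_construct_divmod_result`; a hedged usage sketch of the behavior it backs, assuming a plain integer Series:

import pandas as pd

quotient, remainder = divmod(pd.Series([7, 8, 9]), 3)
# divmod on a Series returns a pair of like-indexed Series rather than one object:
# quotient  -> [2, 2, 3]
# remainder -> [1, 2, 0]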
@@ -1628,8 +1684,9 @@ def _arith_method_SERIES(cls, op, special):
op_name = _get_op_name(op, special)
eval_kwargs = _gen_eval_kwargs(op_name)
fill_zeros = _gen_fill_zeros(op_name)
- construct_result = (_construct_divmod_result
- if op in [divmod, rdivmod] else _construct_result)
+ construct_result = (
+ _construct_divmod_result if op in [divmod, rdivmod] else _construct_result
+ )
def na_op(x, y):
"""
@@ -1651,21 +1708,20 @@ def na_op(x, y):
TypeError : invalid operation
"""
import pandas.core.computation.expressions as expressions
+
try:
result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
except TypeError:
result = masked_arith_op(x, y, op)
except Exception: # TODO: more specific?
if is_object_dtype(x):
- return libalgos.arrmap_object(x,
- lambda val: op(val, y))
+ return libalgos.arrmap_object(x, lambda val: op(val, y))
raise
if isinstance(result, tuple):
# e.g. divmod
result = tuple(
- missing.fill_zeros(r, x, y, op_name, fill_zeros)
- for r in result
+ missing.fill_zeros(r, x, y, op_name, fill_zeros) for r in result
)
else:
result = missing.fill_zeros(result, x, y, op_name, fill_zeros)
@@ -1680,27 +1736,29 @@ def wrapper(left, right):
right = maybe_upcast_for_op(right)
if is_categorical_dtype(left):
- raise TypeError("{typ} cannot perform the operation "
- "{op}".format(typ=type(left).__name__, op=str_rep))
+ raise TypeError(
+ "{typ} cannot perform the operation "
+ "{op}".format(typ=type(left).__name__, op=str_rep)
+ )
elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left):
# Give dispatch_to_index_op a chance for tests like
# test_dt64_series_add_intlike, which the index dispatching handles
# specifically.
result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex)
- return construct_result(left, result,
- index=left.index, name=res_name,
- dtype=result.dtype)
+ return construct_result(
+ left, result, index=left.index, name=res_name, dtype=result.dtype
+ )
- elif (is_extension_array_dtype(left) or
- (is_extension_array_dtype(right) and not is_scalar(right))):
+ elif is_extension_array_dtype(left) or (
+ is_extension_array_dtype(right) and not is_scalar(right)
+ ):
# GH#22378 disallow scalar to exclude e.g. "category", "Int64"
return dispatch_to_extension_op(op, left, right)
elif is_timedelta64_dtype(left):
result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex)
- return construct_result(left, result,
- index=left.index, name=res_name)
+ return construct_result(left, result, index=left.index, name=res_name)
elif is_timedelta64_dtype(right):
# We should only get here with non-scalar or timedelta64('NaT')
@@ -1709,19 +1767,20 @@ def wrapper(left, right):
# that may incorrectly raise TypeError when we
# should get NullFrequencyError
result = op(pd.Index(left), right)
- return construct_result(left, result,
- index=left.index, name=res_name,
- dtype=result.dtype)
+ return construct_result(
+ left, result, index=left.index, name=res_name, dtype=result.dtype
+ )
lvalues = left.values
rvalues = right
if isinstance(rvalues, ABCSeries):
rvalues = rvalues.values
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = na_op(lvalues, rvalues)
- return construct_result(left, result,
- index=left.index, name=res_name, dtype=None)
+ return construct_result(
+ left, result, index=left.index, name=res_name, dtype=None
+ )
wrapper.__name__ = op_name
return wrapper
@@ -1749,7 +1808,7 @@ def _comp_method_SERIES(cls, op, special):
code duplication.
"""
op_name = _get_op_name(op, special)
- masker = _gen_eval_kwargs(op_name).get('masker', False)
+ masker = _gen_eval_kwargs(op_name).get("masker", False)
def na_op(x, y):
# TODO:
@@ -1779,12 +1838,12 @@ def na_op(x, y):
mask = None
if not is_scalar(y) and needs_i8_conversion(y):
mask = isna(x) | isna(y)
- y = y.view('i8')
- x = x.view('i8')
+ y = y.view("i8")
+ x = x.view("i8")
method = getattr(x, op_name, None)
if method is not None:
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = method(y)
if result is NotImplemented:
return invalid_comparison(x, y, op)
@@ -1812,21 +1871,20 @@ def wrapper(self, other, axis=None):
return NotImplemented
elif isinstance(other, ABCSeries) and not self._indexed_same(other):
- raise ValueError("Can only compare identically-labeled "
- "Series objects")
+ raise ValueError("Can only compare identically-labeled Series objects")
elif is_categorical_dtype(self):
# Dispatch to Categorical implementation; pd.CategoricalIndex
# behavior is non-canonical GH#19513
res_values = dispatch_to_index_op(op, self, other, pd.Categorical)
- return self._constructor(res_values, index=self.index,
- name=res_name)
+ return self._constructor(res_values, index=self.index, name=res_name)
elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self):
# Dispatch to DatetimeIndex to ensure identical
# Series/Index behavior
- if (isinstance(other, datetime.date) and
- not isinstance(other, datetime.datetime)):
+ if isinstance(other, datetime.date) and not isinstance(
+ other, datetime.datetime
+ ):
# https://github.com/pandas-dev/pandas/issues/21152
# Compatibility for difference between Series comparison w/
# datetime and date
@@ -1844,27 +1902,23 @@ def wrapper(self, other, axis=None):
future = "a TypeError will be raised"
else:
future = (
- "'the values will not compare equal to the "
- "'datetime.date'"
+ "'the values will not compare equal to the 'datetime.date'"
)
- msg = '\n'.join(textwrap.wrap(msg.format(future=future)))
+ msg = "\n".join(textwrap.wrap(msg.format(future=future)))
warnings.warn(msg, FutureWarning, stacklevel=2)
other = pd.Timestamp(other)
- res_values = dispatch_to_index_op(op, self, other,
- pd.DatetimeIndex)
+ res_values = dispatch_to_index_op(op, self, other, pd.DatetimeIndex)
- return self._constructor(res_values, index=self.index,
- name=res_name)
+ return self._constructor(res_values, index=self.index, name=res_name)
elif is_timedelta64_dtype(self):
- res_values = dispatch_to_index_op(op, self, other,
- pd.TimedeltaIndex)
- return self._constructor(res_values, index=self.index,
- name=res_name)
+ res_values = dispatch_to_index_op(op, self, other, pd.TimedeltaIndex)
+ return self._constructor(res_values, index=self.index, name=res_name)
- elif (is_extension_array_dtype(self) or
- (is_extension_array_dtype(other) and not is_scalar(other))):
+ elif is_extension_array_dtype(self) or (
+ is_extension_array_dtype(other) and not is_scalar(other)
+ ):
# Note: the `not is_scalar(other)` condition rules out
# e.g. other == "category"
return dispatch_to_extension_op(op, self, other)
@@ -1874,14 +1928,15 @@ def wrapper(self, other, axis=None):
res_values = na_op(self.values, other.values)
# rename is needed in case res_name is None and res_values.name
# is not.
- return self._constructor(res_values, index=self.index,
- name=res_name).rename(res_name)
+ return self._constructor(
+ res_values, index=self.index, name=res_name
+ ).rename(res_name)
elif isinstance(other, (np.ndarray, pd.Index)):
# do not check length of zerodim array
# as it will broadcast
if other.ndim != 0 and len(self) != len(other):
- raise ValueError('Lengths must match to compare')
+ raise ValueError("Lengths must match to compare")
res_values = na_op(self.values, np.asarray(other))
result = self._constructor(res_values, index=self.index)
@@ -1895,22 +1950,25 @@ def wrapper(self, other, axis=None):
res_values = np.ones(len(self), dtype=bool)
else:
res_values = np.zeros(len(self), dtype=bool)
- return self._constructor(res_values, index=self.index,
- name=res_name, dtype='bool')
+ return self._constructor(
+ res_values, index=self.index, name=res_name, dtype="bool"
+ )
else:
values = self.to_numpy()
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
res = na_op(values, other)
if is_scalar(res):
- raise TypeError('Could not compare {typ} type with Series'
- .format(typ=type(other)))
+ raise TypeError(
+ "Could not compare {typ} type with Series".format(typ=type(other))
+ )
# always return a full value series here
res_values = com.values_from_object(res)
- return self._constructor(res_values, index=self.index,
- name=res_name, dtype='bool')
+ return self._constructor(
+ res_values, index=self.index, name=res_name, dtype="bool"
+ )
wrapper.__name__ = op_name
return wrapper
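The GH#21152 branch above mediates comparisons between a datetime64 Series and a bare `datetime.date`; a hedged sketch of the call it affects, assuming the behavior of this era of pandas:

import datetime
import pandas as pd

s = pd.Series(pd.to_datetime(["2019-07-03", "2019-07-04"]))
# The date is coerced to a Timestamp (with a FutureWarning) before dispatching
# to the DatetimeIndex comparison, so the comparison is evaluated elementwise.
mask = s == datetime.date(2019, 7, 3)  # [True, False]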
@@ -1941,12 +1999,19 @@ def na_op(x, y):
y = bool(y)
try:
result = libops.scalar_binop(x, y, op)
- except (TypeError, ValueError, AttributeError,
- OverflowError, NotImplementedError):
- raise TypeError("cannot compare a dtyped [{dtype}] array "
- "with a scalar of type [{typ}]"
- .format(dtype=x.dtype,
- typ=type(y).__name__))
+ except (
+ TypeError,
+ ValueError,
+ AttributeError,
+ OverflowError,
+ NotImplementedError,
+ ):
+ raise TypeError(
+ "cannot compare a dtyped [{dtype}] array "
+ "with a scalar of type [{typ}]".format(
+ dtype=x.dtype, typ=type(y).__name__
+ )
+ )
return result
@@ -1984,11 +2049,9 @@ def wrapper(self, other):
# For int vs int `^`, `|`, `&` are bitwise operators and return
# integer dtypes. Otherwise these are boolean ops
- filler = (fill_int if is_self_int_dtype and is_other_int_dtype
- else fill_bool)
+ filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool
res_values = na_op(self.values, ovalues)
- unfilled = self._constructor(res_values,
- index=self.index, name=res_name)
+ unfilled = self._constructor(res_values, index=self.index, name=res_name)
filled = filler(unfilled)
return finalizer(filled)
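The `fill_int`/`fill_bool` choice above reflects that `&`, `|`, `^` stay integer-valued for int/int operands; a small illustration with plain Series, assuming the behavior of this era of pandas:

import pandas as pd

pd.Series([1, 2, 3]) & pd.Series([3, 3, 3])          # int64: [1, 2, 3] (bitwise)
pd.Series([True, False]) & pd.Series([True, True])   # bool:  [True, False]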
@@ -1998,7 +2061,7 @@ def wrapper(self, other):
def _flex_method_SERIES(cls, op, special):
name = _get_op_name(op, special)
- doc = _make_flex_doc(name, 'series')
+ doc = _make_flex_doc(name, "series")
@Appender(doc)
def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
@@ -2009,15 +2072,14 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
return self._binop(other, op, level=level, fill_value=fill_value)
elif isinstance(other, (np.ndarray, list, tuple)):
if len(other) != len(self):
- raise ValueError('Lengths must be equal')
+ raise ValueError("Lengths must be equal")
other = self._constructor(other, self.index)
return self._binop(other, op, level=level, fill_value=fill_value)
else:
if fill_value is not None:
self = self.fillna(fill_value)
- return self._constructor(op(self, other),
- self.index).__finalize__(self)
+ return self._constructor(op(self, other), self.index).__finalize__(self)
flex_wrapper.__name__ = name
return flex_wrapper
@@ -2027,8 +2089,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0):
# DataFrame
-def _combine_series_frame(self, other, func, fill_value=None, axis=None,
- level=None):
+def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=None):
"""
Apply binary operator `func` to self, other using alignment and fill
conventions determined by the fill_value, axis, and level kwargs.
@@ -2047,8 +2108,9 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None,
result : DataFrame
"""
if fill_value is not None:
- raise NotImplementedError("fill_value {fill} not supported."
- .format(fill=fill_value))
+ raise NotImplementedError(
+ "fill_value {fill} not supported.".format(fill=fill_value)
+ )
if axis is not None:
axis = self._get_axis_number(axis)
@@ -2062,8 +2124,9 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None,
if not len(self):
# Ambiguous case, use _series so works with DataFrame
- return self._constructor(data=self._series, index=self.index,
- columns=self.columns)
+ return self._constructor(
+ data=self._series, index=self.index, columns=self.columns
+ )
# default axis is columns
return self._combine_match_columns(other, func, level=level)
@@ -2073,17 +2136,20 @@ def _align_method_FRAME(left, right, axis):
""" convert rhs to meet lhs dims if input is list, tuple or np.ndarray """
def to_series(right):
- msg = ('Unable to coerce to Series, length must be {req_len}: '
- 'given {given_len}')
- if axis is not None and left._get_axis_name(axis) == 'index':
+ msg = (
+ "Unable to coerce to Series, length must be {req_len}: given {given_len}"
+ )
+ if axis is not None and left._get_axis_name(axis) == "index":
if len(left.index) != len(right):
- raise ValueError(msg.format(req_len=len(left.index),
- given_len=len(right)))
+ raise ValueError(
+ msg.format(req_len=len(left.index), given_len=len(right))
+ )
right = left._constructor_sliced(right, index=left.index)
else:
if len(left.columns) != len(right):
- raise ValueError(msg.format(req_len=len(left.columns),
- given_len=len(right)))
+ raise ValueError(
+ msg.format(req_len=len(left.columns), given_len=len(right))
+ )
right = left._constructor_sliced(right, index=left.columns)
return right
@@ -2094,32 +2160,32 @@ def to_series(right):
elif right.ndim == 2:
if right.shape == left.shape:
- right = left._constructor(right, index=left.index,
- columns=left.columns)
+ right = left._constructor(right, index=left.index, columns=left.columns)
elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
# Broadcast across columns
right = np.broadcast_to(right, left.shape)
- right = left._constructor(right,
- index=left.index,
- columns=left.columns)
+ right = left._constructor(right, index=left.index, columns=left.columns)
elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
# Broadcast along rows
right = to_series(right[0, :])
else:
- raise ValueError("Unable to coerce to DataFrame, shape "
- "must be {req_shape}: given {given_shape}"
- .format(req_shape=left.shape,
- given_shape=right.shape))
+ raise ValueError(
+ "Unable to coerce to DataFrame, shape "
+ "must be {req_shape}: given {given_shape}".format(
+ req_shape=left.shape, given_shape=right.shape
+ )
+ )
elif right.ndim > 2:
- raise ValueError('Unable to coerce to Series/DataFrame, dim '
- 'must be <= 2: {dim}'.format(dim=right.shape))
+ raise ValueError(
+ "Unable to coerce to Series/DataFrame, dim "
+ "must be <= 2: {dim}".format(dim=right.shape)
+ )
- elif (is_list_like(right) and
- not isinstance(right, (ABCSeries, ABCDataFrame))):
+ elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)):
# GH17901
right = to_series(right)
@@ -2146,7 +2212,7 @@ def na_op(x, y):
if op_name in _op_descriptions:
# i.e. include "add" but not "__add__"
- doc = _make_flex_doc(op_name, 'dataframe')
+ doc = _make_flex_doc(op_name, "dataframe")
else:
doc = _arith_doc_FRAME % op_name
@@ -2163,9 +2229,9 @@ def f(self, other, axis=default_axis, level=None, fill_value=None):
# For these values of `axis`, we end up dispatching to Series op,
# so do not want the masked op.
pass_op = op if axis in [0, "columns", None] else na_op
- return _combine_series_frame(self, other, pass_op,
- fill_value=fill_value, axis=axis,
- level=level)
+ return _combine_series_frame(
+ self, other, pass_op, fill_value=fill_value, axis=axis, level=level
+ )
else:
if fill_value is not None:
self = self.fillna(fill_value)
@@ -2185,14 +2251,15 @@ def _flex_comp_method_FRAME(cls, op, special):
def na_op(x, y):
try:
- with np.errstate(invalid='ignore'):
+ with np.errstate(invalid="ignore"):
result = op(x, y)
except TypeError:
result = mask_cmp_op(x, y, op)
return result
- doc = _flex_comp_doc_FRAME.format(op_name=op_name,
- desc=_op_descriptions[op_name]['desc'])
+ doc = _flex_comp_doc_FRAME.format(
+ op_name=op_name, desc=_op_descriptions[op_name]["desc"]
+ )
@Appender(doc)
def f(self, other, axis=default_axis, level=None):
@@ -2202,14 +2269,13 @@ def f(self, other, axis=default_axis, level=None):
if isinstance(other, ABCDataFrame):
# Another DataFrame
if not self._indexed_same(other):
- self, other = self.align(other, 'outer',
- level=level, copy=False)
+ self, other = self.align(other, "outer", level=level, copy=False)
return dispatch_to_series(self, other, na_op, str_rep)
elif isinstance(other, ABCSeries):
- return _combine_series_frame(self, other, na_op,
- fill_value=None, axis=axis,
- level=level)
+ return _combine_series_frame(
+ self, other, na_op, fill_value=None, axis=axis, level=level
+ )
else:
assert np.ndim(other) == 0, other
return self._combine_const(other, na_op)
@@ -2223,7 +2289,7 @@ def _comp_method_FRAME(cls, func, special):
str_rep = _get_opstr(func, cls)
op_name = _get_op_name(func, special)
- @Appender('Wrapper for comparison method {name}'.format(name=op_name))
+ @Appender("Wrapper for comparison method {name}".format(name=op_name))
def f(self, other):
other = _align_method_FRAME(self, other, axis=None)
@@ -2231,14 +2297,15 @@ def f(self, other):
if isinstance(other, ABCDataFrame):
# Another DataFrame
if not self._indexed_same(other):
- raise ValueError('Can only compare identically-labeled '
- 'DataFrame objects')
+ raise ValueError(
+ "Can only compare identically-labeled DataFrame objects"
+ )
return dispatch_to_series(self, other, func, str_rep)
elif isinstance(other, ABCSeries):
- return _combine_series_frame(self, other, func,
- fill_value=None, axis=None,
- level=None)
+ return _combine_series_frame(
+ self, other, func, fill_value=None, axis=None, level=None
+ )
else:
# straight boolean comparisons we want to allow all columns
@@ -2254,6 +2321,7 @@ def f(self, other):
# -----------------------------------------------------------------------------
# Sparse
+
def _cast_sparse_series_op(left, right, opname):
"""
For SparseSeries operation, coerce to float64 if the result is expected
@@ -2272,15 +2340,15 @@ def _cast_sparse_series_op(left, right, opname):
"""
from pandas.core.sparse.api import SparseDtype
- opname = opname.strip('_')
+ opname = opname.strip("_")
# TODO: This should be moved to the array?
if is_integer_dtype(left) and is_integer_dtype(right):
# series coerces to float64 if result should have NaN/inf
- if opname in ('floordiv', 'mod') and (right.to_dense() == 0).any():
+ if opname in ("floordiv", "mod") and (right.to_dense() == 0).any():
left = left.astype(SparseDtype(np.float64, left.fill_value))
right = right.astype(SparseDtype(np.float64, right.fill_value))
- elif opname in ('rfloordiv', 'rmod') and (left.to_dense() == 0).any():
+ elif opname in ("rfloordiv", "rmod") and (left.to_dense() == 0).any():
left = left.astype(SparseDtype(np.float64, left.fill_value))
right = right.astype(SparseDtype(np.float64, right.fill_value))
@@ -2302,25 +2370,25 @@ def wrapper(self, other):
other = other.to_sparse(fill_value=self.fill_value)
return _sparse_series_op(self, other, op, op_name)
elif is_scalar(other):
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
new_values = op(self.values, other)
- return self._constructor(new_values,
- index=self.index,
- name=self.name)
+ return self._constructor(new_values, index=self.index, name=self.name)
else: # pragma: no cover
- raise TypeError('operation with {other} not supported'
- .format(other=type(other)))
+ raise TypeError(
+ "operation with {other} not supported".format(other=type(other))
+ )
wrapper.__name__ = op_name
return wrapper
def _sparse_series_op(left, right, op, name):
- left, right = left.align(right, join='outer', copy=False)
+ left, right = left.align(right, join="outer", copy=False)
new_index = left.index
new_name = get_op_result_name(left, right)
from pandas.core.arrays.sparse import _sparse_array_op
+
lvalues, rvalues = _cast_sparse_series_op(left.values, right.values, name)
result = _sparse_array_op(lvalues, rvalues, op, name)
return left._constructor(result, index=new_index, name=new_name)
@@ -2335,36 +2403,40 @@ def _arith_method_SPARSE_ARRAY(cls, op, special):
def wrapper(self, other):
from pandas.core.arrays.sparse.array import (
- SparseArray, _sparse_array_op, _wrap_result, _get_fill)
+ SparseArray,
+ _sparse_array_op,
+ _wrap_result,
+ _get_fill,
+ )
+
if isinstance(other, np.ndarray):
if len(self) != len(other):
- raise AssertionError("length mismatch: {self} vs. {other}"
- .format(self=len(self), other=len(other)))
+ raise AssertionError(
+ "length mismatch: {self} vs. {other}".format(
+ self=len(self), other=len(other)
+ )
+ )
if not isinstance(other, SparseArray):
- dtype = getattr(other, 'dtype', None)
- other = SparseArray(other, fill_value=self.fill_value,
- dtype=dtype)
+ dtype = getattr(other, "dtype", None)
+ other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
return _sparse_array_op(self, other, op, op_name)
elif is_scalar(other):
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
fill = op(_get_fill(self), np.asarray(other))
result = op(self.sp_values, other)
return _wrap_result(op_name, result, self.sp_index, fill)
else: # pragma: no cover
- raise TypeError('operation with {other} not supported'
- .format(other=type(other)))
+ raise TypeError(
+ "operation with {other} not supported".format(other=type(other))
+ )
wrapper.__name__ = op_name
return wrapper
def maybe_dispatch_ufunc_to_dunder_op(
- self: ArrayLike,
- ufunc: Callable,
- method: str,
- *inputs: ArrayLike,
- **kwargs: Any
+ self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any
):
"""
Dispatch a ufunc to the equivalent dunder method.
@@ -2387,33 +2459,48 @@ def maybe_dispatch_ufunc_to_dunder_op(
The result of applying the ufunc
"""
# special has the ufuncs we dispatch to the dunder op on
- special = {'add', 'sub', 'mul', 'pow', 'mod', 'floordiv', 'truediv',
- 'divmod', 'eq', 'ne', 'lt', 'gt', 'le', 'ge', 'remainder',
- 'matmul'}
+ special = {
+ "add",
+ "sub",
+ "mul",
+ "pow",
+ "mod",
+ "floordiv",
+ "truediv",
+ "divmod",
+ "eq",
+ "ne",
+ "lt",
+ "gt",
+ "le",
+ "ge",
+ "remainder",
+ "matmul",
+ }
aliases = {
- 'subtract': 'sub',
- 'multiply': 'mul',
- 'floor_divide': 'floordiv',
- 'true_divide': 'truediv',
- 'power': 'pow',
- 'remainder': 'mod',
- 'divide': 'div',
- 'equal': 'eq',
- 'not_equal': 'ne',
- 'less': 'lt',
- 'less_equal': 'le',
- 'greater': 'gt',
- 'greater_equal': 'ge',
+ "subtract": "sub",
+ "multiply": "mul",
+ "floor_divide": "floordiv",
+ "true_divide": "truediv",
+ "power": "pow",
+ "remainder": "mod",
+ "divide": "div",
+ "equal": "eq",
+ "not_equal": "ne",
+ "less": "lt",
+ "less_equal": "le",
+ "greater": "gt",
+ "greater_equal": "ge",
}
# For op(., Array) -> Array.__r{op}__
flipped = {
- 'lt': '__gt__',
- 'le': '__ge__',
- 'gt': '__lt__',
- 'ge': '__le__',
- 'eq': '__eq__',
- 'ne': '__ne__',
+ "lt": "__gt__",
+ "le": "__ge__",
+ "gt": "__lt__",
+ "ge": "__le__",
+ "eq": "__eq__",
+ "ne": "__ne__",
}
op_name = ufunc.__name__
@@ -2422,13 +2509,12 @@ def maybe_dispatch_ufunc_to_dunder_op(
def not_implemented(*args, **kwargs):
return NotImplemented
- if (method == '__call__' and op_name in special
- and kwargs.get('out') is None):
+ if method == "__call__" and op_name in special and kwargs.get("out") is None:
if isinstance(inputs[0], type(self)):
- name = '__{}__'.format(op_name)
+ name = "__{}__".format(op_name)
return getattr(self, name, not_implemented)(inputs[1])
else:
- name = flipped.get(op_name, '__r{}__'.format(op_name))
+ name = flipped.get(op_name, "__r{}__".format(op_name))
return getattr(self, name, not_implemented)(inputs[0])
else:
return NotImplemented
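Before the patch moves on to pandas/core/resample.py, a hedged standalone sketch of the dispatch rule encoded by `special`, `aliases`, and `flipped` above; the array class and the trimmed-down tables are illustrative, not part of the patch:

import numpy as np

# trimmed-down stand-ins for the tables defined in the function above
aliases = {"less": "lt", "greater": "gt"}
flipped = {"lt": "__gt__", "gt": "__lt__"}
special = {"lt", "gt"}

class MyArray:
    """Hypothetical array-like used only to illustrate the dispatch rule."""

    def __init__(self, data):
        self.data = np.asarray(data)

    def __lt__(self, other):
        return self.data < other

    def __gt__(self, other):
        return self.data > other

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        op_name = aliases.get(ufunc.__name__, ufunc.__name__)
        if method == "__call__" and op_name in special and kwargs.get("out") is None:
            if isinstance(inputs[0], type(self)):
                return getattr(self, "__{}__".format(op_name))(inputs[1])
            # op(other, arr) is answered by the reflected comparison
            return getattr(self, flipped[op_name])(inputs[0])
        return NotImplemented

arr = MyArray([1, 2, 3])
print(np.less(arr, 2))  # arr.__lt__(2)          -> [ True False False]
print(np.less(2, arr))  # flipped: arr.__gt__(2) -> [False False  True]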
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 632b5a9c5e002..b4a3e6ed71bf4 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -21,8 +21,7 @@
from pandas.core.generic import _shared_docs
from pandas.core.groupby.base import GroupByMixin
from pandas.core.groupby.generic import SeriesGroupBy
-from pandas.core.groupby.groupby import (
- GroupBy, _GroupBy, _pipe_template, groupby)
+from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, groupby
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
from pandas.core.indexes.datetimes import DatetimeIndex, date_range
@@ -60,8 +59,16 @@ class Resampler(_GroupBy):
"""
# to the groupby descriptor
- _attributes = ['freq', 'axis', 'closed', 'label', 'convention',
- 'loffset', 'base', 'kind']
+ _attributes = [
+ "freq",
+ "axis",
+ "closed",
+ "label",
+ "convention",
+ "loffset",
+ "base",
+ "kind",
+ ]
def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs):
self.groupby = groupby
@@ -83,11 +90,14 @@ def __str__(self):
"""
Provide a nice str repr of our rolling object.
"""
- attrs = ("{k}={v}".format(k=k, v=getattr(self.groupby, k))
- for k in self._attributes if
- getattr(self.groupby, k, None) is not None)
- return "{klass} [{attrs}]".format(klass=self.__class__.__name__,
- attrs=', '.join(attrs))
+ attrs = (
+ "{k}={v}".format(k=k, v=getattr(self.groupby, k))
+ for k in self._attributes
+ if getattr(self.groupby, k, None) is not None
+ )
+ return "{klass} [{attrs}]".format(
+ klass=self.__class__.__name__, attrs=", ".join(attrs)
+ )
def __getattr__(self, attr):
if attr in self._internal_names_set:
@@ -129,8 +139,8 @@ def _typ(self):
Masquerade for compat as a Series or a DataFrame.
"""
if isinstance(self._selected_obj, pd.Series):
- return 'series'
- return 'dataframe'
+ return "series"
+ return "dataframe"
@property
def _from_selection(self):
@@ -139,9 +149,9 @@ def _from_selection(self):
"""
# upsampling and PeriodIndex resampling do not work
# with selection, this state used to catch and raise an error
- return (self.groupby is not None and
- (self.groupby.key is not None or
- self.groupby.level is not None))
+ return self.groupby is not None and (
+ self.groupby.key is not None or self.groupby.level is not None
+ )
def _convert_obj(self, obj):
"""
@@ -186,9 +196,10 @@ def _assure_grouper(self):
"""
self._set_binner()
- @Substitution(klass='Resampler',
- versionadded='.. versionadded:: 0.23.0',
- examples="""
+ @Substitution(
+ klass="Resampler",
+ versionadded=".. versionadded:: 0.23.0",
+ examples="""
>>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
... index=pd.date_range('2012-08-02', periods=4))
>>> df
@@ -204,20 +215,24 @@ def _assure_grouper(self):
>>> df.resample('2D').pipe(lambda x: x.max() - x.min())
A
2012-08-02 1
- 2012-08-04 1""")
+ 2012-08-04 1""",
+ )
@Appender(_pipe_template)
def pipe(self, func, *args, **kwargs):
return super().pipe(func, *args, **kwargs)
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
DataFrame.groupby.aggregate
DataFrame.resample.transform
DataFrame.aggregate
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
>>> s = pd.Series([1,2,3,4,5],
@@ -251,14 +266,17 @@ def pipe(self, func, *args, **kwargs):
2013-01-01 00:00:00 3 2.121320
2013-01-01 00:00:02 7 4.949747
2013-01-01 00:00:04 5 NaN
- """)
-
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='DataFrame',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="DataFrame",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, func, *args, **kwargs):
self._set_binner()
@@ -266,10 +284,7 @@ def aggregate(self, func, *args, **kwargs):
if result is None:
how = func
grouper = None
- result = self._groupby_and_aggregate(how,
- grouper,
- *args,
- **kwargs)
+ result = self._groupby_and_aggregate(how, grouper, *args, **kwargs)
result = self._apply_loffset(result)
return result
@@ -295,8 +310,7 @@ def transform(self, arg, *args, **kwargs):
--------
>>> resampled.transform(lambda x: (x - x.mean()) / x.std())
"""
- return self._selected_obj.groupby(self.groupby).transform(
- arg, *args, **kwargs)
+ return self._selected_obj.groupby(self.groupby).transform(arg, *args, **kwargs)
def _downsample(self, f):
raise AbstractMethodError(self)
@@ -370,10 +384,9 @@ def _apply_loffset(self, result):
"""
needs_offset = (
- isinstance(self.loffset, (DateOffset, timedelta,
- np.timedelta64)) and
- isinstance(result.index, DatetimeIndex) and
- len(result.index) > 0
+ isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64))
+ and isinstance(result.index, DatetimeIndex)
+ and len(result.index) > 0
)
if needs_offset:
@@ -401,7 +414,7 @@ def _wrap_result(self, result):
result.index = obj.index.asfreq(self.freq)
else:
result.index = obj.index._shallow_copy(freq=self.freq)
- result.name = getattr(obj, 'name', None)
+ result.name = getattr(obj, "name", None)
return result
@@ -423,7 +436,8 @@ def pad(self, limit=None):
Series.fillna
DataFrame.fillna
"""
- return self._upsample('pad', limit=limit)
+ return self._upsample("pad", limit=limit)
+
ffill = pad
def nearest(self, limit=None):
@@ -486,7 +500,7 @@ def nearest(self, limit=None):
2018-01-01 01:00:00 2.0
Freq: 15T, dtype: float64
"""
- return self._upsample('nearest', limit=limit)
+ return self._upsample("nearest", limit=limit)
def backfill(self, limit=None):
"""
@@ -589,7 +603,8 @@ def backfill(self, limit=None):
2018-01-01 01:45:00 6.0 5.0
2018-01-01 02:00:00 6.0 5.0
"""
- return self._upsample('backfill', limit=limit)
+ return self._upsample("backfill", limit=limit)
+
bfill = backfill
def fillna(self, method, limit=None):
@@ -752,21 +767,34 @@ def fillna(self, method, limit=None):
"""
return self._upsample(method, limit=limit)
- @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)
- def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
- limit_direction='forward', limit_area=None,
- downcast=None, **kwargs):
+ @Appender(_shared_docs["interpolate"] % _shared_docs_kwargs)
+ def interpolate(
+ self,
+ method="linear",
+ axis=0,
+ limit=None,
+ inplace=False,
+ limit_direction="forward",
+ limit_area=None,
+ downcast=None,
+ **kwargs
+ ):
"""
Interpolate values according to different methods.
.. versionadded:: 0.18.1
"""
result = self._upsample(None)
- return result.interpolate(method=method, axis=axis, limit=limit,
- inplace=inplace,
- limit_direction=limit_direction,
- limit_area=limit_area,
- downcast=downcast, **kwargs)
+ return result.interpolate(
+ method=method,
+ axis=axis,
+ limit=limit,
+ inplace=inplace,
+ limit_direction=limit_direction,
+ limit_area=limit_area,
+ downcast=downcast,
+ **kwargs
+ )
def asfreq(self, fill_value=None):
"""
@@ -790,7 +818,7 @@ def asfreq(self, fill_value=None):
Series.asfreq
DataFrame.asfreq
"""
- return self._upsample('asfreq', fill_value=fill_value)
+ return self._upsample("asfreq", fill_value=fill_value)
def std(self, ddof=1, *args, **kwargs):
"""
@@ -806,8 +834,8 @@ def std(self, ddof=1, *args, **kwargs):
DataFrame or Series
Standard deviation of values within each group.
"""
- nv.validate_resampler_func('std', args, kwargs)
- return self._downsample('std', ddof=ddof)
+ nv.validate_resampler_func("std", args, kwargs)
+ return self._downsample("std", ddof=ddof)
def var(self, ddof=1, *args, **kwargs):
"""
@@ -823,16 +851,16 @@ def var(self, ddof=1, *args, **kwargs):
DataFrame or Series
Variance of values within each group.
"""
- nv.validate_resampler_func('var', args, kwargs)
- return self._downsample('var', ddof=ddof)
+ nv.validate_resampler_func("var", args, kwargs)
+ return self._downsample("var", ddof=ddof)
@Appender(GroupBy.size.__doc__)
def size(self):
# It's a special case as higher level does return
# a copy of 0-len objects. GH14962
- result = self._downsample('size')
+ result = self._downsample("size")
if not len(self.ax) and isinstance(self._selected_obj, ABCDataFrame):
- result = pd.Series([], index=result.index, dtype='int64')
+ result = pd.Series([], index=result.index, dtype="int64")
return result
def quantile(self, q=0.5, **kwargs):
@@ -856,40 +884,45 @@ def quantile(self, q=0.5, **kwargs):
DataFrame.quantile
DataFrameGroupBy.quantile
"""
- return self._downsample('quantile', q=q, **kwargs)
+ return self._downsample("quantile", q=q, **kwargs)
# downsample methods
-for method in ['sum', 'prod']:
+for method in ["sum", "prod"]:
def f(self, _method=method, min_count=0, *args, **kwargs):
nv.validate_resampler_func(_method, args, kwargs)
return self._downsample(_method, min_count=min_count)
+
f.__doc__ = getattr(GroupBy, method).__doc__
setattr(Resampler, method, f)
# downsample methods
-for method in ['min', 'max', 'first', 'last', 'mean', 'sem',
- 'median', 'ohlc']:
+for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]:
def g(self, _method=method, *args, **kwargs):
nv.validate_resampler_func(_method, args, kwargs)
return self._downsample(_method)
+
g.__doc__ = getattr(GroupBy, method).__doc__
setattr(Resampler, method, g)
# groupby & aggregate methods
-for method in ['count']:
+for method in ["count"]:
+
def h(self, _method=method):
return self._downsample(_method)
+
h.__doc__ = getattr(GroupBy, method).__doc__
setattr(Resampler, method, h)
# series only methods
-for method in ['nunique']:
+for method in ["nunique"]:
+
def h(self, _method=method):
return self._downsample(_method)
+
h.__doc__ = getattr(SeriesGroupBy, method).__doc__
setattr(Resampler, method, h)
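The loops above bind the loop variable through a default argument (`_method=method`); a minimal sketch, outside the diff, of why that binding matters:

# Closures resolve a loop variable at call time, so without the default-argument
# binding every generated function would end up using the last method name.
fns_late = [lambda: method for method in ["sum", "prod"]]
fns_bound = [lambda _method=method: _method for method in ["sum", "prod"]]

print([f() for f in fns_late])   # ['prod', 'prod']
print([f() for f in fns_bound])  # ['sum', 'prod']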
@@ -913,26 +946,30 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None):
# if we have both a how and fill_method, then show
# the following warning
if fill_method is None:
- warnings.warn("how in .resample() is deprecated\n"
- "the new syntax is "
- ".resample(...).{method}".format(
- method=method),
- FutureWarning, stacklevel=3)
+ warnings.warn(
+ "how in .resample() is deprecated\n"
+ "the new syntax is "
+ ".resample(...).{method}".format(method=method),
+ FutureWarning,
+ stacklevel=3,
+ )
r = r.aggregate(how)
if fill_method is not None:
# show the prior function call
- method = '.' + method if how is not None else ''
+ method = "." + method if how is not None else ""
args = "limit={0}".format(limit) if limit is not None else ""
- warnings.warn("fill_method is deprecated to .resample()\n"
- "the new syntax is .resample(...){method}"
- ".{fill_method}({args})".format(
- method=method,
- fill_method=fill_method,
- args=args),
- FutureWarning, stacklevel=3)
+ warnings.warn(
+ "fill_method is deprecated to .resample()\n"
+ "the new syntax is .resample(...){method}"
+ ".{fill_method}({args})".format(
+ method=method, fill_method=fill_method, args=args
+ ),
+ FutureWarning,
+ stacklevel=3,
+ )
if how is not None:
r = getattr(r, fill_method)(limit=limit)
@@ -946,10 +983,11 @@ class _GroupByMixin(GroupByMixin):
"""
Provide the groupby facilities.
"""
+
def __init__(self, obj, *args, **kwargs):
- parent = kwargs.pop('parent', None)
- groupby = kwargs.pop('groupby', None)
+ parent = kwargs.pop("parent", None)
+ groupby = kwargs.pop("groupby", None)
if parent is None:
parent = obj
@@ -988,7 +1026,6 @@ def func(x):
class DatetimeIndexResampler(Resampler):
-
@property
def _resampler_for_grouping(self):
return DatetimeIndexResamplerGroupby
@@ -996,7 +1033,7 @@ def _resampler_for_grouping(self):
def _get_binner_for_time(self):
# this is how we are actually creating the bins
- if self.kind == 'period':
+ if self.kind == "period":
return self.groupby._get_time_period_bins(self.ax)
return self.groupby._get_time_bins(self.ax)
@@ -1030,8 +1067,7 @@ def _downsample(self, how, **kwargs):
# we are downsampling
# we want to call the actual grouper method here
- result = obj.groupby(
- self.grouper, axis=self.axis).aggregate(how, **kwargs)
+ result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs)
result = self._apply_loffset(result)
return self._wrap_result(result)
@@ -1042,7 +1078,7 @@ def _adjust_binner_for_upsample(self, binner):
The range of a new index should not be outside specified range
"""
- if self.closed == 'right':
+ if self.closed == "right":
binner = binner[1:]
else:
binner = binner[:-1]
@@ -1066,12 +1102,14 @@ def _upsample(self, method, limit=None, fill_value=None):
"""
self._set_binner()
if self.axis:
- raise AssertionError('axis must be 0')
+ raise AssertionError("axis must be 0")
if self._from_selection:
- raise ValueError("Upsampling from level= or on= selection"
- " is not supported, use .set_index(...)"
- " to explicitly set index to"
- " datetime-like")
+ raise ValueError(
+ "Upsampling from level= or on= selection"
+ " is not supported, use .set_index(...)"
+ " to explicitly set index to"
+ " datetime-like"
+ )
ax = self.ax
obj = self._selected_obj
@@ -1083,8 +1121,9 @@ def _upsample(self, method, limit=None, fill_value=None):
result = obj.copy()
result.index = res_index
else:
- result = obj.reindex(res_index, method=method,
- limit=limit, fill_value=fill_value)
+ result = obj.reindex(
+ res_index, method=method, limit=limit, fill_value=fill_value
+ )
result = self._apply_loffset(result)
return self._wrap_result(result)
@@ -1094,7 +1133,7 @@ def _wrap_result(self, result):
# we may have a different kind that we were asked originally
# convert if needed
- if self.kind == 'period' and not isinstance(result.index, PeriodIndex):
+ if self.kind == "period" and not isinstance(result.index, PeriodIndex):
result.index = result.index.to_period(self.freq)
return result
@@ -1105,19 +1144,19 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler):
.. versionadded:: 0.18.1
"""
+
@property
def _constructor(self):
return DatetimeIndexResampler
class PeriodIndexResampler(DatetimeIndexResampler):
-
@property
def _resampler_for_grouping(self):
return PeriodIndexResamplerGroupby
def _get_binner_for_time(self):
- if self.kind == 'timestamp':
+ if self.kind == "timestamp":
return super()._get_binner_for_time()
return self.groupby._get_period_bins(self.ax)
@@ -1126,18 +1165,20 @@ def _convert_obj(self, obj):
if self._from_selection:
# see GH 14008, GH 12871
- msg = ("Resampling from level= or on= selection"
- " with a PeriodIndex is not currently supported,"
- " use .set_index(...) to explicitly set index")
+ msg = (
+ "Resampling from level= or on= selection"
+ " with a PeriodIndex is not currently supported,"
+ " use .set_index(...) to explicitly set index"
+ )
raise NotImplementedError(msg)
if self.loffset is not None:
# Cannot apply loffset/timedelta to PeriodIndex -> convert to
# timestamps
- self.kind = 'timestamp'
+ self.kind = "timestamp"
# convert to timestamp
- if self.kind == 'timestamp':
+ if self.kind == "timestamp":
obj = obj.to_timestamp(how=self.convention)
return obj
@@ -1153,7 +1194,7 @@ def _downsample(self, how, **kwargs):
"""
# we may need to actually resample as if we are timestamps
- if self.kind == 'timestamp':
+ if self.kind == "timestamp":
return super()._downsample(how, **kwargs)
how = self._is_cython_func(how) or how
@@ -1161,10 +1202,9 @@ def _downsample(self, how, **kwargs):
if is_subperiod(ax.freq, self.freq):
# Downsampling
- return self._groupby_and_aggregate(how, grouper=self.grouper,
- **kwargs)
+ return self._groupby_and_aggregate(how, grouper=self.grouper, **kwargs)
elif is_superperiod(ax.freq, self.freq):
- if how == 'ohlc':
+ if how == "ohlc":
# GH #13083
# upsampling to subperiods is handled as an asfreq, which works
# for pure aggregating/reducing methods
@@ -1176,8 +1216,9 @@ def _downsample(self, how, **kwargs):
return self.asfreq()
raise IncompatibleFrequency(
- 'Frequency {} cannot be resampled to {}, as they are not '
- 'sub or super periods'.format(ax.freq, self.freq))
+ "Frequency {} cannot be resampled to {}, as they are not "
+ "sub or super periods".format(ax.freq, self.freq)
+ )
def _upsample(self, method, limit=None, fill_value=None):
"""
@@ -1197,9 +1238,8 @@ def _upsample(self, method, limit=None, fill_value=None):
"""
# we may need to actually resample as if we are timestamps
- if self.kind == 'timestamp':
- return super()._upsample(method, limit=limit,
- fill_value=fill_value)
+ if self.kind == "timestamp":
+ return super()._upsample(method, limit=limit, fill_value=fill_value)
self._set_binner()
ax = self.ax
@@ -1211,8 +1251,9 @@ def _upsample(self, method, limit=None, fill_value=None):
# Get the fill indexer
indexer = memb.get_indexer(new_index, method=method, limit=limit)
- return self._wrap_result(_take_new_index(
- obj, indexer, new_index, axis=self.axis))
+ return self._wrap_result(
+ _take_new_index(obj, indexer, new_index, axis=self.axis)
+ )
class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
@@ -1221,13 +1262,13 @@ class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
.. versionadded:: 0.18.1
"""
+
@property
def _constructor(self):
return PeriodIndexResampler
class TimedeltaIndexResampler(DatetimeIndexResampler):
-
@property
def _resampler_for_grouping(self):
return TimedeltaIndexResamplerGroupby
@@ -1251,6 +1292,7 @@ class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler):
.. versionadded:: 0.18.1
"""
+
@property
def _constructor(self):
return TimedeltaIndexResampler
@@ -1267,22 +1309,20 @@ def resample(obj, kind=None, **kwds):
resample.__doc__ = Resampler.__doc__
-def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None,
- limit=None, kind=None, **kwargs):
+def get_resampler_for_grouping(
+ groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs
+):
"""
Return our appropriate resampler when grouping as well.
"""
# .resample uses 'on' similar to how .groupby uses 'key'
- kwargs['key'] = kwargs.pop('on', None)
+ kwargs["key"] = kwargs.pop("on", None)
tg = TimeGrouper(freq=rule, **kwargs)
resampler = tg._get_resampler(groupby.obj, kind=kind)
r = resampler._get_resampler_for_grouping(groupby=groupby)
- return _maybe_process_deprecations(r,
- how=how,
- fill_method=fill_method,
- limit=limit)
+ return _maybe_process_deprecations(r, how=how, fill_method=fill_method, limit=limit)
class TimeGrouper(Grouper):
@@ -1297,45 +1337,61 @@ class TimeGrouper(Grouper):
convention : {'start', 'end', 'e', 's'}
If axis is PeriodIndex
"""
- _attributes = Grouper._attributes + ('closed', 'label', 'how',
- 'loffset', 'kind', 'convention',
- 'base')
- def __init__(self, freq='Min', closed=None, label=None, how='mean',
- axis=0, fill_method=None, limit=None, loffset=None,
- kind=None, convention=None, base=0, **kwargs):
+ _attributes = Grouper._attributes + (
+ "closed",
+ "label",
+ "how",
+ "loffset",
+ "kind",
+ "convention",
+ "base",
+ )
+
+ def __init__(
+ self,
+ freq="Min",
+ closed=None,
+ label=None,
+ how="mean",
+ axis=0,
+ fill_method=None,
+ limit=None,
+ loffset=None,
+ kind=None,
+ convention=None,
+ base=0,
+ **kwargs
+ ):
# Check for correctness of the keyword arguments which would
# otherwise silently use the default if misspelled
- if label not in {None, 'left', 'right'}:
- raise ValueError('Unsupported value {} for `label`'.format(label))
- if closed not in {None, 'left', 'right'}:
- raise ValueError('Unsupported value {} for `closed`'.format(
- closed))
- if convention not in {None, 'start', 'end', 'e', 's'}:
- raise ValueError('Unsupported value {} for `convention`'
- .format(convention))
+ if label not in {None, "left", "right"}:
+ raise ValueError("Unsupported value {} for `label`".format(label))
+ if closed not in {None, "left", "right"}:
+ raise ValueError("Unsupported value {} for `closed`".format(closed))
+ if convention not in {None, "start", "end", "e", "s"}:
+ raise ValueError("Unsupported value {} for `convention`".format(convention))
freq = to_offset(freq)
- end_types = {'M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'}
+ end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"}
rule = freq.rule_code
- if (rule in end_types or
- ('-' in rule and rule[:rule.find('-')] in end_types)):
+ if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
if closed is None:
- closed = 'right'
+ closed = "right"
if label is None:
- label = 'right'
+ label = "right"
else:
if closed is None:
- closed = 'left'
+ closed = "left"
if label is None:
- label = 'left'
+ label = "left"
self.closed = closed
self.label = label
self.kind = kind
- self.convention = convention or 'E'
+ self.convention = convention or "E"
self.convention = self.convention.lower()
if isinstance(loffset, str):
@@ -1348,7 +1404,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
self.base = base
# always sort time groupers
- kwargs['sort'] = True
+ kwargs["sort"] = True
super().__init__(freq=freq, axis=axis, **kwargs)
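The `end_types` test reformatted above picks the default `closed`/`label` side from the rule code; a hedged, standalone restatement of that decision (the set mirrors the one defined above, the helper name is made up):

end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"}

def default_sides(rule):
    # anchored end-of-period rules (e.g. "M", "W-SUN") default to the right bin edge
    if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
        return "right", "right"
    return "left", "left"

print(default_sides("W-SUN"))  # ('right', 'right')
print(default_sides("5T"))     # ('left', 'left')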
@@ -1375,23 +1431,17 @@ def _get_resampler(self, obj, kind=None):
ax = self.ax
if isinstance(ax, DatetimeIndex):
- return DatetimeIndexResampler(obj,
- groupby=self,
- kind=kind,
- axis=self.axis)
- elif isinstance(ax, PeriodIndex) or kind == 'period':
- return PeriodIndexResampler(obj,
- groupby=self,
- kind=kind,
- axis=self.axis)
+ return DatetimeIndexResampler(obj, groupby=self, kind=kind, axis=self.axis)
+ elif isinstance(ax, PeriodIndex) or kind == "period":
+ return PeriodIndexResampler(obj, groupby=self, kind=kind, axis=self.axis)
elif isinstance(ax, TimedeltaIndex):
- return TimedeltaIndexResampler(obj,
- groupby=self,
- axis=self.axis)
+ return TimedeltaIndexResampler(obj, groupby=self, axis=self.axis)
- raise TypeError("Only valid with DatetimeIndex, "
- "TimedeltaIndex or PeriodIndex, "
- "but got an instance of %r" % type(ax).__name__)
+ raise TypeError(
+ "Only valid with DatetimeIndex, "
+ "TimedeltaIndex or PeriodIndex, "
+ "but got an instance of %r" % type(ax).__name__
+ )
def _get_grouper(self, obj, validate=True):
# create the resampler and return our binner
@@ -1401,43 +1451,46 @@ def _get_grouper(self, obj, validate=True):
def _get_time_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
- raise TypeError('axis must be a DatetimeIndex, but got '
- 'an instance of %r' % type(ax).__name__)
+ raise TypeError(
+ "axis must be a DatetimeIndex, but got "
+ "an instance of %r" % type(ax).__name__
+ )
if len(ax) == 0:
- binner = labels = DatetimeIndex(
- data=[], freq=self.freq, name=ax.name)
+ binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
return binner, [], labels
- first, last = _get_timestamp_range_edges(ax.min(), ax.max(),
- self.freq,
- closed=self.closed,
- base=self.base)
+ first, last = _get_timestamp_range_edges(
+ ax.min(), ax.max(), self.freq, closed=self.closed, base=self.base
+ )
# GH #12037
# use first/last directly instead of call replace() on them
# because replace() will swallow the nanosecond part
# thus last bin maybe slightly before the end if the end contains
# nanosecond part and lead to `Values falls after last bin` error
- binner = labels = date_range(freq=self.freq,
- start=first,
- end=last,
- tz=ax.tz,
- name=ax.name,
- ambiguous='infer',
- nonexistent='shift_forward')
+ binner = labels = date_range(
+ freq=self.freq,
+ start=first,
+ end=last,
+ tz=ax.tz,
+ name=ax.name,
+ ambiguous="infer",
+ nonexistent="shift_forward",
+ )
ax_values = ax.asi8
binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
# general version, knowing nothing about relative frequencies
bins = lib.generate_bins_dt64(
- ax_values, bin_edges, self.closed, hasnans=ax.hasnans)
+ ax_values, bin_edges, self.closed, hasnans=ax.hasnans
+ )
- if self.closed == 'right':
+ if self.closed == "right":
labels = binner
- if self.label == 'right':
+ if self.label == "right":
labels = labels[1:]
- elif self.label == 'right':
+ elif self.label == "right":
labels = labels[1:]
if ax.hasnans:
@@ -1448,15 +1501,15 @@ def _get_time_bins(self, ax):
# adjust the labels
# GH4076
if len(bins) < len(labels):
- labels = labels[:len(bins)]
+ labels = labels[: len(bins)]
return binner, bins, labels
def _adjust_bin_edges(self, binner, ax_values):
# Some hacks for > daily data, see #1471, #1458, #1483
- if self.freq != 'D' and is_superperiod(self.freq, 'D'):
- if self.closed == 'right':
+ if self.freq != "D" and is_superperiod(self.freq, "D"):
+ if self.closed == "right":
# GH 21459, GH 9119: Adjust the bins relative to the wall time
bin_edges = binner.tz_localize(None)
bin_edges = bin_edges + timedelta(1) - Nano(1)
@@ -1474,22 +1527,22 @@ def _adjust_bin_edges(self, binner, ax_values):
def _get_time_delta_bins(self, ax):
if not isinstance(ax, TimedeltaIndex):
- raise TypeError('axis must be a TimedeltaIndex, but got '
- 'an instance of %r' % type(ax).__name__)
+ raise TypeError(
+ "axis must be a TimedeltaIndex, but got "
+ "an instance of %r" % type(ax).__name__
+ )
if not len(ax):
- binner = labels = TimedeltaIndex(
- data=[], freq=self.freq, name=ax.name)
+ binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name)
return binner, [], labels
start, end = ax.min(), ax.max()
- labels = binner = timedelta_range(start=start,
- end=end,
- freq=self.freq,
- name=ax.name)
+ labels = binner = timedelta_range(
+ start=start, end=end, freq=self.freq, name=ax.name
+ )
end_stamps = labels + self.freq
- bins = ax.searchsorted(end_stamps, side='left')
+ bins = ax.searchsorted(end_stamps, side="left")
# Addresses GH #10530
if self.base > 0:
@@ -1499,8 +1552,10 @@ def _get_time_delta_bins(self, ax):
def _get_time_period_bins(self, ax):
if not isinstance(ax, DatetimeIndex):
- raise TypeError('axis must be a DatetimeIndex, but got '
- 'an instance of %r' % type(ax).__name__)
+ raise TypeError(
+ "axis must be a DatetimeIndex, but got "
+ "an instance of %r" % type(ax).__name__
+ )
freq = self.freq
@@ -1508,22 +1563,23 @@ def _get_time_period_bins(self, ax):
binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name)
return binner, [], labels
- labels = binner = pd.period_range(start=ax[0],
- end=ax[-1],
- freq=freq,
- name=ax.name)
+ labels = binner = pd.period_range(
+ start=ax[0], end=ax[-1], freq=freq, name=ax.name
+ )
- end_stamps = (labels + freq).asfreq(freq, 's').to_timestamp()
+ end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp()
if ax.tzinfo:
end_stamps = end_stamps.tz_localize(ax.tzinfo)
- bins = ax.searchsorted(end_stamps, side='left')
+ bins = ax.searchsorted(end_stamps, side="left")
return binner, bins, labels
def _get_period_bins(self, ax):
if not isinstance(ax, PeriodIndex):
- raise TypeError('axis must be a PeriodIndex, but got '
- 'an instance of %r' % type(ax).__name__)
+ raise TypeError(
+ "axis must be a PeriodIndex, but got "
+ "an instance of %r" % type(ax).__name__
+ )
memb = ax.asfreq(self.freq, how=self.convention)
@@ -1535,33 +1591,30 @@ def _get_period_bins(self, ax):
# if index contains no valid (non-NaT) values, return empty index
if not len(memb):
- binner = labels = PeriodIndex(
- data=[], freq=self.freq, name=ax.name)
+ binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
return binner, [], labels
freq_mult = self.freq.n
start = ax.min().asfreq(self.freq, how=self.convention)
- end = ax.max().asfreq(self.freq, how='end')
+ end = ax.max().asfreq(self.freq, how="end")
bin_shift = 0
# GH 23882
if self.base:
# get base adjusted bin edge labels
- p_start, end = _get_period_range_edges(start,
- end,
- self.freq,
- closed=self.closed,
- base=self.base)
+ p_start, end = _get_period_range_edges(
+ start, end, self.freq, closed=self.closed, base=self.base
+ )
# Get offset for bin edge (not label edge) adjustment
- start_offset = (pd.Period(start, self.freq)
- - pd.Period(p_start, self.freq))
+ start_offset = pd.Period(start, self.freq) - pd.Period(p_start, self.freq)
bin_shift = start_offset.n % freq_mult
start = p_start
- labels = binner = pd.period_range(start=start, end=end,
- freq=self.freq, name=ax.name)
+ labels = binner = pd.period_range(
+ start=start, end=end, freq=self.freq, name=ax.name
+ )
i8 = memb.asi8
@@ -1572,7 +1625,7 @@ def _get_period_bins(self, ax):
rng += freq_mult
# adjust bin edge indexes to account for base
rng -= bin_shift
- bins = memb.searchsorted(rng, side='left')
+ bins = memb.searchsorted(rng, side="left")
if nat_count > 0:
# NaT handling as in pandas._lib.lib.generate_bins_dt64()
@@ -1594,13 +1647,14 @@ def _take_new_index(obj, indexer, new_index, axis=0):
elif isinstance(obj, DataFrame):
if axis == 1:
raise NotImplementedError("axis 1 is not supported")
- return DataFrame(obj._data.reindex_indexer(
- new_axis=new_index, indexer=indexer, axis=1))
+ return DataFrame(
+ obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
+ )
else:
raise ValueError("'obj' should be either a Series or a DataFrame")
-def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
+def _get_timestamp_range_edges(first, last, offset, closed="left", base=0):
"""
Adjust the `first` Timestamp to the preceding Timestamp that resides on
the provided offset. Adjust the `last` Timestamp to the following
@@ -1634,8 +1688,9 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
first = first.tz_localize(None)
last = last.tz_localize(None)
- first, last = _adjust_dates_anchored(first, last, offset,
- closed=closed, base=base)
+ first, last = _adjust_dates_anchored(
+ first, last, offset, closed=closed, base=base
+ )
if isinstance(offset, Day):
first = first.tz_localize(tz)
last = last.tz_localize(tz)
@@ -1645,7 +1700,7 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
first = first.normalize()
last = last.normalize()
- if closed == 'left':
+ if closed == "left":
first = Timestamp(offset.rollback(first))
else:
first = Timestamp(first - offset)
@@ -1655,7 +1710,7 @@ def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):
return first, last
-def _get_period_range_edges(first, last, offset, closed='left', base=0):
+def _get_period_range_edges(first, last, offset, closed="left", base=0):
"""
Adjust the provided `first` and `last` Periods to the respective Period of
the given offset that encompasses them.
@@ -1686,15 +1741,16 @@ def _get_period_range_edges(first, last, offset, closed='left', base=0):
adjust_first = not offset.onOffset(first)
adjust_last = offset.onOffset(last)
- first, last = _get_timestamp_range_edges(first, last, offset,
- closed=closed, base=base)
+ first, last = _get_timestamp_range_edges(
+ first, last, offset, closed=closed, base=base
+ )
first = (first + adjust_first * offset).to_period(offset)
last = (last - adjust_last * offset).to_period(offset)
return first, last
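The "encompassing Period" behaviour described in the docstring corresponds to Period.asfreq on the public API; a tiny sketch:

    import pandas as pd

    p = pd.Period("2019-07-03", freq="D")
    p.asfreq("M")                              # Period('2019-07', 'M'), the month that encompasses the day
    pd.Period("2019-07", freq="M").asfreq("D", how="end")  # Period('2019-07-31', 'D')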
-def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
+def _adjust_dates_anchored(first, last, offset, closed="right", base=0):
# First and last offsets should be calculated from the start day to fix an
    # error caused by resampling across multiple days when a one-day period is
# not a multiple of the frequency.
@@ -1708,9 +1764,9 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
last_tzinfo = last.tzinfo
start_day_nanos = first.normalize().value
if first_tzinfo is not None:
- first = first.tz_convert('UTC')
+ first = first.tz_convert("UTC")
if last_tzinfo is not None:
- last = last.tz_convert('UTC')
+ last = last.tz_convert("UTC")
base_nanos = (base % offset.n) * offset.nanos // offset.n
start_day_nanos += base_nanos
@@ -1718,7 +1774,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
foffset = (first.value - start_day_nanos) % offset.nanos
loffset = (last.value - start_day_nanos) % offset.nanos
- if closed == 'right':
+ if closed == "right":
if foffset > 0:
# roll back
fresult = first.value - foffset
@@ -1746,9 +1802,9 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0):
fresult = Timestamp(fresult)
lresult = Timestamp(lresult)
if first_tzinfo is not None:
- fresult = fresult.tz_localize('UTC').tz_convert(first_tzinfo)
+ fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
if last_tzinfo is not None:
- lresult = lresult.tz_localize('UTC').tz_convert(last_tzinfo)
+ lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo)
return fresult, lresult
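The start-of-day anchoring above is what keeps intraday bin edges stable regardless of where the data happens to begin; a small sketch with arbitrary data:

    import pandas as pd

    idx = pd.date_range("2019-07-03 09:01", periods=6, freq="T")
    s = pd.Series(range(6), index=idx)
    # base=2 anchors the 5-minute bin edges at :02, :07, :12, ... after midnight
    s.resample("5T", base=2).sum()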
@@ -1761,7 +1817,7 @@ def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):
raise NotImplementedError("'method' argument is not supported")
if how is None:
- how = 'E'
+ how = "E"
new_obj = obj.copy()
new_obj.index = obj.index.asfreq(freq, how=how)
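The how="E" default above is the usual end-of-period convention; a short example on a period-indexed Series:

    import pandas as pd

    s = pd.Series([1, 2], index=pd.period_range("2019Q1", periods=2, freq="Q"))
    s.asfreq("M")               # how defaults to end: periods 2019-03 and 2019-06
    s.asfreq("M", how="start")  # periods 2019-01 and 2019-04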
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index d4272cf6e406d..5a476dceca1f3 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -11,11 +11,16 @@
from pandas import DataFrame, Index, MultiIndex, Series
from pandas.core import common as com
from pandas.core.arrays.categorical import (
- _factorize_from_iterable, _factorize_from_iterables)
+ _factorize_from_iterable,
+ _factorize_from_iterables,
+)
from pandas.core.generic import NDFrame
from pandas.core.index import (
- _all_indexes_same, _get_consensus_names, _get_objs_combined_axis,
- ensure_index)
+ _all_indexes_same,
+ _get_consensus_names,
+ _get_objs_combined_axis,
+ ensure_index,
+)
import pandas.core.indexes.base as ibase
from pandas.core.internals import concatenate_block_managers
@@ -23,9 +28,19 @@
# Concatenate DataFrame objects
-def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
- keys=None, levels=None, names=None, verify_integrity=False,
- sort=None, copy=True):
+def concat(
+ objs,
+ axis=0,
+ join="outer",
+ join_axes=None,
+ ignore_index=False,
+ keys=None,
+ levels=None,
+ names=None,
+ verify_integrity=False,
+ sort=None,
+ copy=True,
+):
"""
Concatenate pandas objects along a particular axis with optional set logic
along the other axes.
@@ -226,10 +241,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
...
ValueError: Indexes have overlapping values: ['a']
"""
- op = _Concatenator(objs, axis=axis, ignore_index=ignore_index, join=join,
- join_axes=join_axes, keys=keys, levels=levels,
- names=names, verify_integrity=verify_integrity,
- copy=copy, sort=sort)
+ op = _Concatenator(
+ objs,
+ axis=axis,
+ ignore_index=ignore_index,
+ join=join,
+ join_axes=join_axes,
+ keys=keys,
+ levels=levels,
+ names=names,
+ verify_integrity=verify_integrity,
+ copy=copy,
+ sort=sort,
+ )
return op.get_result()
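For reference, the keyword set passed through here maps onto the familiar public call; a minimal usage sketch with illustrative frames:

    import pandas as pd

    df1 = pd.DataFrame({"a": [1, 2]})
    df2 = pd.DataFrame({"a": [3, 4]})
    pd.concat([df1, df2], keys=["x", "y"])    # MultiIndex built from the keys
    pd.concat([df1, df2], ignore_index=True)  # fresh RangeIndex 0..3
    # verify_integrity=True would raise ValueError here, since both frames share index [0, 1]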
@@ -239,21 +263,35 @@ class _Concatenator:
Orchestrates a concatenation operation for BlockManagers
"""
- def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
- levels=None, names=None, ignore_index=False,
- verify_integrity=False, copy=True, sort=False):
+ def __init__(
+ self,
+ objs,
+ axis=0,
+ join="outer",
+ join_axes=None,
+ keys=None,
+ levels=None,
+ names=None,
+ ignore_index=False,
+ verify_integrity=False,
+ copy=True,
+ sort=False,
+ ):
if isinstance(objs, (NDFrame, str)):
- raise TypeError('first argument must be an iterable of pandas '
- 'objects, you passed an object of type '
- '"{name}"'.format(name=type(objs).__name__))
+ raise TypeError(
+ "first argument must be an iterable of pandas "
+ "objects, you passed an object of type "
+ '"{name}"'.format(name=type(objs).__name__)
+ )
- if join == 'outer':
+ if join == "outer":
self.intersect = False
- elif join == 'inner':
+ elif join == "inner":
self.intersect = True
else: # pragma: no cover
- raise ValueError('Only can inner (intersect) or outer (union) '
- 'join the other axis')
+ raise ValueError(
+                "Only can inner (intersect) or outer (union) join the other axis"
+ )
if isinstance(objs, dict):
if keys is None:
@@ -263,7 +301,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
objs = list(objs)
if len(objs) == 0:
- raise ValueError('No objects to concatenate')
+ raise ValueError("No objects to concatenate")
if keys is None:
objs = list(com._not_none(*objs))
@@ -277,19 +315,20 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
clean_keys.append(k)
clean_objs.append(v)
objs = clean_objs
- name = getattr(keys, 'name', None)
+ name = getattr(keys, "name", None)
keys = Index(clean_keys, name=name)
if len(objs) == 0:
- raise ValueError('All objects passed were None')
+ raise ValueError("All objects passed were None")
# consolidate data & figure out what our result ndim is going to be
ndims = set()
for obj in objs:
if not isinstance(obj, (Series, DataFrame)):
- msg = ("cannot concatenate object of type '{}';"
- ' only Series and DataFrame objs are valid'
- .format(type(obj)))
+ msg = (
+ "cannot concatenate object of type '{}';"
+ " only Series and DataFrame objs are valid".format(type(obj))
+ )
raise TypeError(msg)
# consolidate
@@ -310,11 +349,13 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
else:
            # filter out the empties if we have no multi-index possibilities
            # note: keep empty Series, as they affect the result columns / names
- non_empties = [obj for obj in objs
- if sum(obj.shape) > 0 or isinstance(obj, Series)]
+ non_empties = [
+ obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series)
+ ]
- if (len(non_empties) and (keys is None and names is None and
- levels is None and not self.intersect)):
+ if len(non_empties) and (
+ keys is None and names is None and levels is None and not self.intersect
+ ):
objs = non_empties
sample = objs[0]
@@ -335,8 +376,10 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
self._is_series = isinstance(sample, Series)
if not 0 <= axis <= sample.ndim:
- raise AssertionError("axis must be between 0 and {ndim}, input was"
- " {axis}".format(ndim=sample.ndim, axis=axis))
+ raise AssertionError(
+ "axis must be between 0 and {ndim}, input was"
+ " {axis}".format(ndim=sample.ndim, axis=axis)
+ )
# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
@@ -351,11 +394,13 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
pass
elif ndim != max_ndim - 1:
- raise ValueError("cannot concatenate unaligned mixed "
- "dimensional NDFrame objects")
+ raise ValueError(
+ "cannot concatenate unaligned mixed "
+ "dimensional NDFrame objects"
+ )
else:
- name = getattr(obj, 'name', None)
+ name = getattr(obj, "name", None)
if ignore_index or name is None:
name = current_column
current_column += 1
@@ -372,7 +417,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None,
self.axis = axis
self.join_axes = join_axes
self.keys = keys
- self.names = names or getattr(keys, 'names', None)
+ self.names = names or getattr(keys, "names", None)
self.levels = levels
self.sort = sort
@@ -391,10 +436,11 @@ def get_result(self):
if self.axis == 0:
name = com.consensus_name_attr(self.objs)
- mgr = self.objs[0]._data.concat([x._data for x in self.objs],
- self.new_axes)
+ mgr = self.objs[0]._data.concat(
+ [x._data for x in self.objs], self.new_axes
+ )
cons = _concat._get_series_result_type(mgr, self.objs)
- return cons(mgr, name=name).__finalize__(self, method='concat')
+ return cons(mgr, name=name).__finalize__(self, method="concat")
# combine as columns in a frame
else:
@@ -404,7 +450,7 @@ def get_result(self):
index, columns = self.new_axes
df = cons(data, index=index)
df.columns = columns
- return df.__finalize__(self, method='concat')
+ return df.__finalize__(self, method="concat")
# combine block managers
else:
@@ -424,14 +470,15 @@ def get_result(self):
mgrs_indexers.append((obj._data, indexers))
new_data = concatenate_block_managers(
- mgrs_indexers, self.new_axes, concat_axis=self.axis,
- copy=self.copy)
+ mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy
+ )
if not self.copy:
new_data._consolidate_inplace()
cons = _concat._get_frame_result_type(new_data, self.objs)
- return (cons._from_axes(new_data, self.new_axes)
- .__finalize__(self, method='concat'))
+ return cons._from_axes(new_data, self.new_axes).__finalize__(
+ self, method="concat"
+ )
def _get_result_dim(self):
if self._is_series and self.axis == 1:
@@ -452,13 +499,18 @@ def _get_new_axes(self):
else:
# GH 21951
warnings.warn(
- 'The join_axes-keyword is deprecated. Use .reindex or '
- '.reindex_like on the result to achieve the same '
- 'functionality.', FutureWarning, stacklevel=4)
+ "The join_axes-keyword is deprecated. Use .reindex or "
+ ".reindex_like on the result to achieve the same "
+ "functionality.",
+ FutureWarning,
+ stacklevel=4,
+ )
if len(self.join_axes) != ndim - 1:
- raise AssertionError("length of join_axes must be equal "
- "to {length}".format(length=ndim - 1))
+ raise AssertionError(
+ "length of join_axes must be equal "
+ "to {length}".format(length=ndim - 1)
+ )
# ufff...
indices = list(range(ndim))
@@ -473,13 +525,12 @@ def _get_new_axes(self):
def _get_comb_axis(self, i):
data_axis = self.objs[0]._get_block_manager_axis(i)
try:
- return _get_objs_combined_axis(self.objs, axis=data_axis,
- intersect=self.intersect,
- sort=self.sort)
+ return _get_objs_combined_axis(
+ self.objs, axis=data_axis, intersect=self.intersect, sort=self.sort
+ )
except IndexError:
types = [type(x).__name__ for x in self.objs]
- raise TypeError("Cannot concatenate list of {types}"
- .format(types=types))
+ raise TypeError("Cannot concatenate list of {types}".format(types=types))
def _get_concat_axis(self):
"""
@@ -497,9 +548,10 @@ def _get_concat_axis(self):
has_names = False
for i, x in enumerate(self.objs):
if not isinstance(x, Series):
- raise TypeError("Cannot concatenate type 'Series' "
- "with object of type {type!r}"
- .format(type=type(x).__name__))
+ raise TypeError(
+ "Cannot concatenate type 'Series' "
+ "with object of type {type!r}".format(type=type(x).__name__)
+ )
if x.name is not None:
names[i] = x.name
has_names = True
@@ -522,8 +574,9 @@ def _get_concat_axis(self):
if self.keys is None:
concat_axis = _concat_indexes(indexes)
else:
- concat_axis = _make_concat_multiindex(indexes, self.keys,
- self.levels, self.names)
+ concat_axis = _make_concat_multiindex(
+ indexes, self.keys, self.levels, self.names
+ )
self._maybe_check_integrity(concat_axis)
@@ -533,8 +586,10 @@ def _maybe_check_integrity(self, concat_index):
if self.verify_integrity:
if not concat_index.is_unique:
overlap = concat_index[concat_index.duplicated()].unique()
- raise ValueError('Indexes have overlapping values: '
- '{overlap!s}'.format(overlap=overlap))
+ raise ValueError(
+ "Indexes have overlapping values: "
+ "{overlap!s}".format(overlap=overlap)
+ )
def _concat_indexes(indexes):
@@ -543,8 +598,9 @@ def _concat_indexes(indexes):
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
- if ((levels is None and isinstance(keys[0], tuple)) or
- (levels is not None and len(levels) > 1)):
+ if (levels is None and isinstance(keys[0], tuple)) or (
+ levels is not None and len(levels) > 1
+ ):
zipped = list(zip(*keys))
if names is None:
names = [None] * len(zipped)
@@ -575,8 +631,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
try:
i = level.get_loc(key)
except KeyError:
- raise ValueError('Key {key!s} not in level {level!s}'
- .format(key=key, level=level))
+ raise ValueError(
+ "Key {key!s} not in level {level!s}".format(
+ key=key, level=level
+ )
+ )
to_concat.append(np.repeat(i, len(index)))
codes_list.append(np.concatenate(to_concat))
@@ -597,14 +656,17 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
else:
# make sure that all of the passed indices have the same nlevels
if not len({idx.nlevels for idx in indexes}) == 1:
- raise AssertionError("Cannot concat indices that do"
- " not have the same number of levels")
+ raise AssertionError(
+ "Cannot concat indices that do"
+ " not have the same number of levels"
+ )
# also copies
names = names + _get_consensus_names(indexes)
- return MultiIndex(levels=levels, codes=codes_list, names=names,
- verify_integrity=False)
+ return MultiIndex(
+ levels=levels, codes=codes_list, names=names, verify_integrity=False
+ )
new_index = indexes[0]
n = len(new_index)
@@ -625,8 +687,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
mask = mapped == -1
if mask.any():
- raise ValueError('Values not found in passed level: {hlevel!s}'
- .format(hlevel=hlevel[mask]))
+ raise ValueError(
+ "Values not found in passed level: {hlevel!s}".format(
+ hlevel=hlevel[mask]
+ )
+ )
new_codes.append(np.repeat(mapped, n))
@@ -640,5 +705,6 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
if len(new_names) < len(new_levels):
new_names.extend(new_index.names)
- return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
- verify_integrity=False)
+ return MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index d655a8be13de7..9a69942a70e01 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -15,12 +15,18 @@
from pandas.core.tools.numeric import to_numeric
-@Appender(_shared_docs['melt'] %
- dict(caller='pd.melt(df, ',
- versionadded="",
- other='DataFrame.melt'))
-def melt(frame, id_vars=None, value_vars=None, var_name=None,
- value_name='value', col_level=None):
+@Appender(
+ _shared_docs["melt"]
+ % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt")
+)
+def melt(
+ frame,
+ id_vars=None,
+ value_vars=None,
+ var_name=None,
+ value_name="value",
+ col_level=None,
+):
# TODO: what about the existing index?
    # If MultiIndex, gather names of columns on all levels for checking presence
# of `id_vars` and `value_vars`
@@ -31,36 +37,42 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None,
if id_vars is not None:
if not is_list_like(id_vars):
id_vars = [id_vars]
- elif (isinstance(frame.columns, ABCMultiIndex) and
- not isinstance(id_vars, list)):
- raise ValueError('id_vars must be a list of tuples when columns'
- ' are a MultiIndex')
+ elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(id_vars, list):
+ raise ValueError(
+                "id_vars must be a list of tuples when columns are a MultiIndex"
+ )
else:
# Check that `id_vars` are in frame
id_vars = list(id_vars)
missing = Index(np.ravel(id_vars)).difference(cols)
if not missing.empty:
- raise KeyError("The following 'id_vars' are not present"
- " in the DataFrame: {missing}"
- "".format(missing=list(missing)))
+ raise KeyError(
+ "The following 'id_vars' are not present"
+ " in the DataFrame: {missing}"
+ "".format(missing=list(missing))
+ )
else:
id_vars = []
if value_vars is not None:
if not is_list_like(value_vars):
value_vars = [value_vars]
- elif (isinstance(frame.columns, ABCMultiIndex) and
- not isinstance(value_vars, list)):
- raise ValueError('value_vars must be a list of tuples when'
- ' columns are a MultiIndex')
+ elif isinstance(frame.columns, ABCMultiIndex) and not isinstance(
+ value_vars, list
+ ):
+ raise ValueError(
+                "value_vars must be a list of tuples when columns are a MultiIndex"
+ )
else:
value_vars = list(value_vars)
# Check that `value_vars` are in frame
missing = Index(np.ravel(value_vars)).difference(cols)
if not missing.empty:
- raise KeyError("The following 'value_vars' are not present in"
- " the DataFrame: {missing}"
- "".format(missing=list(missing)))
+ raise KeyError(
+ "The following 'value_vars' are not present in"
+ " the DataFrame: {missing}"
+ "".format(missing=list(missing))
+ )
frame = frame.loc[:, id_vars + value_vars]
else:
frame = frame.copy()
@@ -74,11 +86,13 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None,
if len(frame.columns.names) == len(set(frame.columns.names)):
var_name = frame.columns.names
else:
- var_name = ['variable_{i}'.format(i=i)
- for i in range(len(frame.columns.names))]
+ var_name = [
+ "variable_{i}".format(i=i) for i in range(len(frame.columns.names))
+ ]
else:
- var_name = [frame.columns.name if frame.columns.name is not None
- else 'variable']
+ var_name = [
+ frame.columns.name if frame.columns.name is not None else "variable"
+ ]
if isinstance(var_name, str):
var_name = [var_name]
@@ -96,11 +110,10 @@ def melt(frame, id_vars=None, value_vars=None, var_name=None,
mcolumns = id_vars + var_name + [value_name]
- mdata[value_name] = frame.values.ravel('F')
+ mdata[value_name] = frame.values.ravel("F")
for i, col in enumerate(var_name):
# asanyarray will keep the columns as an Index
- mdata[col] = np.asanyarray(frame.columns
- ._get_level_values(i)).repeat(N)
+ mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)
return frame._constructor(mdata, columns=mcolumns)
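A compact usage sketch of the keyword handling reformatted above (column names are illustrative):

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "A": [3, 4], "B": [5, 6]})
    pd.melt(df, id_vars="id", value_vars=["A", "B"], var_name="col", value_name="val")
    # rows: (1, A, 3), (2, A, 4), (1, B, 5), (2, B, 6)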
@@ -150,7 +163,7 @@ def lreshape(data, groups, dropna=True, label=None):
for seq in values:
if len(seq) != K:
- raise ValueError('All column lists must be same length')
+ raise ValueError("All column lists must be same length")
mdata = {}
pivot_cols = []
@@ -159,6 +172,7 @@ def lreshape(data, groups, dropna=True, label=None):
to_concat = [data[col].values for col in names]
import pandas.core.dtypes.concat as _concat
+
mdata[target] = _concat._concat_compat(to_concat)
pivot_cols.append(target)
@@ -175,7 +189,7 @@ def lreshape(data, groups, dropna=True, label=None):
return data._constructor(mdata, columns=id_cols + pivot_cols)
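lreshape's groups argument, handled above, maps each target column to the wide columns to stack; a small sketch with illustrative names:

    import pandas as pd

    df = pd.DataFrame({"team": ["X", "Y"], "hr2018": [10, 12], "hr2019": [14, 11]})
    pd.lreshape(df, {"hr": ["hr2018", "hr2019"]})
    # long frame with columns ['team', 'hr']; 'team' is repeated for each stacked column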
-def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
+def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"):
r"""
Wide panel to long format. Less flexible but more user-friendly than melt.
@@ -403,20 +417,27 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
3 one 2.1
two 2.9
"""
+
def get_var_names(df, stub, sep, suffix):
- regex = r'^{stub}{sep}{suffix}$'.format(
- stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
+ regex = r"^{stub}{sep}{suffix}$".format(
+ stub=re.escape(stub), sep=re.escape(sep), suffix=suffix
+ )
pattern = re.compile(regex)
return [col for col in df.columns if pattern.match(col)]
def melt_stub(df, stub, i, j, value_vars, sep):
- newdf = melt(df, id_vars=i, value_vars=value_vars,
- value_name=stub.rstrip(sep), var_name=j)
+ newdf = melt(
+ df,
+ id_vars=i,
+ value_vars=value_vars,
+ value_name=stub.rstrip(sep),
+ var_name=j,
+ )
newdf[j] = Categorical(newdf[j])
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
# GH17627 Cast numerics suffixes to int/float
- newdf[j] = to_numeric(newdf[j], errors='ignore')
+ newdf[j] = to_numeric(newdf[j], errors="ignore")
return newdf.set_index(i + [j])
@@ -441,9 +462,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):
value_vars_flattened = [e for sublist in value_vars for e in sublist]
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
- melted = [melt_stub(df, s, i, j, v, sep)
- for s, v in zip(stubnames, value_vars)]
- melted = melted[0].join(melted[1:], how='outer')
+ melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
+ melted = melted[0].join(melted[1:], how="outer")
if len(i) == 1:
new = df[id_vars].set_index(i).join(melted)
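A minimal wide_to_long call matching the default sep="" and numeric suffix above (data illustrative):

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "x2018": [1.0, 2.0], "x2019": [3.0, 4.0]})
    pd.wide_to_long(df, stubnames="x", i="id", j="year")
    # long frame indexed by (id, year) with a single value column 'x'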
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 549c69486ebfa..4f910f6a278ad 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -15,12 +15,28 @@
from pandas.util._decorators import Appender, Substitution
from pandas.core.dtypes.common import (
- ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool,
- is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_datetimelike, is_dtype_equal,
- is_extension_array_dtype, is_float_dtype, is_int64_dtype, is_integer,
- is_integer_dtype, is_list_like, is_number, is_numeric_dtype,
- is_object_dtype, needs_i8_conversion)
+ ensure_float64,
+ ensure_int64,
+ ensure_object,
+ is_array_like,
+ is_bool,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetimelike,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float_dtype,
+ is_int64_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_number,
+ is_numeric_dtype,
+ is_object_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.missing import isnull, na_value_for_dtype
from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta
@@ -33,26 +49,46 @@
from pandas.core.sorting import is_int64_overflow_possible
-@Substitution('\nleft : DataFrame')
+@Substitution("\nleft : DataFrame")
@Appender(_merge_doc, indents=0)
-def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
- left_index=False, right_index=False, sort=False,
- suffixes=('_x', '_y'), copy=True, indicator=False,
- validate=None):
- op = _MergeOperation(left, right, how=how, on=on, left_on=left_on,
- right_on=right_on, left_index=left_index,
- right_index=right_index, sort=sort, suffixes=suffixes,
- copy=copy, indicator=indicator,
- validate=validate)
+def merge(
+ left,
+ right,
+ how="inner",
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ sort=False,
+ suffixes=("_x", "_y"),
+ copy=True,
+ indicator=False,
+ validate=None,
+):
+ op = _MergeOperation(
+ left,
+ right,
+ how=how,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ sort=sort,
+ suffixes=suffixes,
+ copy=copy,
+ indicator=indicator,
+ validate=validate,
+ )
return op.get_result()
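The wrapper above forwards straight to _MergeOperation; a short public-API sketch of the common keywords:

    import pandas as pd

    left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
    right = pd.DataFrame({"key": ["a", "c"], "rval": [3, 4]})
    pd.merge(left, right, on="key", how="outer", indicator=True)
    # the _merge column reports 'both', 'left_only' or 'right_only' per row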
if __debug__:
- merge.__doc__ = _merge_doc % '\nleft : DataFrame'
+ merge.__doc__ = _merge_doc % "\nleft : DataFrame"
-def _groupby_and_merge(by, on, left, right, _merge_pieces,
- check_duplicates=True):
+def _groupby_and_merge(by, on, left, right, _merge_pieces, check_duplicates=True):
"""
groupby & merge; we are always performing a left-by type operation
@@ -85,7 +121,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces,
on = [on]
if right.duplicated(by + on).any():
- right = right.drop_duplicates(by + on, keep='last')
+ right = right.drop_duplicates(by + on, keep="last")
rby = right.groupby(by, sort=False)
except KeyError:
rby = None
@@ -100,8 +136,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces,
except KeyError:
# key doesn't exist in left
lcols = lhs.columns.tolist()
- cols = lcols + [r for r in right.columns
- if r not in set(lcols)]
+ cols = lcols + [r for r in right.columns if r not in set(lcols)]
merged = lhs.reindex(columns=cols)
merged.index = range(len(merged))
pieces.append(merged)
@@ -123,16 +158,24 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces,
# preserve the original order
# if we have a missing piece this can be reset
from pandas.core.reshape.concat import concat
+
result = concat(pieces, ignore_index=True)
result = result.reindex(columns=pieces[0].columns, copy=False)
return result, lby
-def merge_ordered(left, right, on=None,
- left_on=None, right_on=None,
- left_by=None, right_by=None,
- fill_method=None, suffixes=('_x', '_y'),
- how='outer'):
+def merge_ordered(
+ left,
+ right,
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_by=None,
+ right_by=None,
+ fill_method=None,
+ suffixes=("_x", "_y"),
+ how="outer",
+):
"""
Perform merge with optional filling/interpolation designed for ordered
data like time series data. Optionally perform group-wise merge (see
@@ -211,36 +254,57 @@ def merge_ordered(left, right, on=None,
8 b d 2 3.0
9 b e 3 3.0
"""
+
def _merger(x, y):
# perform the ordered merge operation
- op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,
- suffixes=suffixes, fill_method=fill_method,
- how=how)
+ op = _OrderedMerge(
+ x,
+ y,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ suffixes=suffixes,
+ fill_method=fill_method,
+ how=how,
+ )
return op.get_result()
if left_by is not None and right_by is not None:
- raise ValueError('Can only group either left or right frames')
+ raise ValueError("Can only group either left or right frames")
elif left_by is not None:
- result, _ = _groupby_and_merge(left_by, on, left, right,
- lambda x, y: _merger(x, y),
- check_duplicates=False)
+ result, _ = _groupby_and_merge(
+ left_by, on, left, right, lambda x, y: _merger(x, y), check_duplicates=False
+ )
elif right_by is not None:
- result, _ = _groupby_and_merge(right_by, on, right, left,
- lambda x, y: _merger(y, x),
- check_duplicates=False)
+ result, _ = _groupby_and_merge(
+ right_by,
+ on,
+ right,
+ left,
+ lambda x, y: _merger(y, x),
+ check_duplicates=False,
+ )
else:
result = _merger(left, right)
return result
-def merge_asof(left, right, on=None,
- left_on=None, right_on=None,
- left_index=False, right_index=False,
- by=None, left_by=None, right_by=None,
- suffixes=('_x', '_y'),
- tolerance=None,
- allow_exact_matches=True,
- direction='backward'):
+def merge_asof(
+ left,
+ right,
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ by=None,
+ left_by=None,
+ right_by=None,
+ suffixes=("_x", "_y"),
+ tolerance=None,
+ allow_exact_matches=True,
+ direction="backward",
+):
"""
Perform an asof merge. This is similar to a left-join except that we
match on nearest key rather than equal keys.
@@ -458,14 +522,23 @@ def merge_asof(left, right, on=None,
3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN
4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
"""
- op = _AsOfMerge(left, right,
- on=on, left_on=left_on, right_on=right_on,
- left_index=left_index, right_index=right_index,
- by=by, left_by=left_by, right_by=right_by,
- suffixes=suffixes,
- how='asof', tolerance=tolerance,
- allow_exact_matches=allow_exact_matches,
- direction=direction)
+ op = _AsOfMerge(
+ left,
+ right,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ by=by,
+ left_by=left_by,
+ right_by=right_by,
+ suffixes=suffixes,
+ how="asof",
+ tolerance=tolerance,
+ allow_exact_matches=allow_exact_matches,
+ direction=direction,
+ )
return op.get_result()
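A compact merge_asof sketch exercising the by/tolerance keywords threaded through above (timestamps illustrative):

    import pandas as pd

    trades = pd.DataFrame(
        {
            "time": pd.to_datetime(["2019-07-03 09:30:00.023", "2019-07-03 09:30:00.048"]),
            "ticker": ["MSFT", "MSFT"],
            "price": [51.95, 51.97],
        }
    )
    quotes = pd.DataFrame(
        {
            "time": pd.to_datetime(["2019-07-03 09:30:00.030"]),
            "ticker": ["MSFT"],
            "bid": [51.93],
        }
    )
    # for each trade, take the most recent quote at or before it, per ticker,
    # but only if it falls within the given tolerance
    pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("20ms"))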
@@ -476,13 +549,26 @@ class _MergeOperation:
Perform a database (SQL) merge operation between two DataFrame objects
using either columns as keys or their row indexes
"""
- _merge_type = 'merge'
- def __init__(self, left, right, how='inner', on=None,
- left_on=None, right_on=None, axis=1,
- left_index=False, right_index=False, sort=True,
- suffixes=('_x', '_y'), copy=True, indicator=False,
- validate=None):
+ _merge_type = "merge"
+
+ def __init__(
+ self,
+ left,
+ right,
+ how="inner",
+ on=None,
+ left_on=None,
+ right_on=None,
+ axis=1,
+ left_index=False,
+ right_index=False,
+ sort=True,
+ suffixes=("_x", "_y"),
+ copy=True,
+ indicator=False,
+ validate=None,
+ ):
left = validate_operand(left)
right = validate_operand(right)
self.left = self.orig_left = left
@@ -506,34 +592,39 @@ def __init__(self, left, right, how='inner', on=None,
if isinstance(self.indicator, str):
self.indicator_name = self.indicator
elif isinstance(self.indicator, bool):
- self.indicator_name = '_merge' if self.indicator else None
+ self.indicator_name = "_merge" if self.indicator else None
else:
raise ValueError(
- 'indicator option can only accept boolean or string arguments')
+ "indicator option can only accept boolean or string arguments"
+ )
if not is_bool(left_index):
raise ValueError(
- 'left_index parameter must be of type bool, not '
- '{left_index}'.format(left_index=type(left_index)))
+ "left_index parameter must be of type bool, not "
+ "{left_index}".format(left_index=type(left_index))
+ )
if not is_bool(right_index):
raise ValueError(
- 'right_index parameter must be of type bool, not '
- '{right_index}'.format(right_index=type(right_index)))
+ "right_index parameter must be of type bool, not "
+ "{right_index}".format(right_index=type(right_index))
+ )
# warn user when merging between different levels
if left.columns.nlevels != right.columns.nlevels:
- msg = ('merging between different levels can give an unintended '
- 'result ({left} levels on the left, {right} on the right)'
- ).format(left=left.columns.nlevels,
- right=right.columns.nlevels)
+ msg = (
+ "merging between different levels can give an unintended "
+ "result ({left} levels on the left, {right} on the right)"
+ ).format(left=left.columns.nlevels, right=right.columns.nlevels)
warnings.warn(msg, UserWarning)
self._validate_specification()
# note this function has side effects
- (self.left_join_keys,
- self.right_join_keys,
- self.join_names) = self._get_merge_keys()
+ (
+ self.left_join_keys,
+ self.right_join_keys,
+ self.join_names,
+ ) = self._get_merge_keys()
# validate the merge keys dtypes. We may need to coerce
# to avoid incompat dtypes
@@ -547,16 +638,16 @@ def __init__(self, left, right, how='inner', on=None,
def get_result(self):
if self.indicator:
- self.left, self.right = self._indicator_pre_merge(
- self.left, self.right)
+ self.left, self.right = self._indicator_pre_merge(self.left, self.right)
join_index, left_indexer, right_indexer = self._get_join_info()
ldata, rdata = self.left._data, self.right._data
lsuf, rsuf = self.suffixes
- llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf,
- rdata.items, rsuf)
+ llabels, rlabels = _items_overlap_with_suffix(
+ ldata.items, lsuf, rdata.items, rsuf
+ )
lindexers = {1: left_indexer} if left_indexer is not None else {}
rindexers = {1: right_indexer} if right_indexer is not None else {}
@@ -564,7 +655,9 @@ def get_result(self):
result_data = concatenate_block_managers(
[(ldata, lindexers), (rdata, rindexers)],
axes=[llabels.append(rlabels), join_index],
- concat_axis=0, copy=self.copy)
+ concat_axis=0,
+ copy=self.copy,
+ )
typ = self.left._constructor
result = typ(result_data).__finalize__(self, method=self._merge_type)
@@ -582,40 +675,42 @@ def _indicator_pre_merge(self, left, right):
columns = left.columns.union(right.columns)
- for i in ['_left_indicator', '_right_indicator']:
+ for i in ["_left_indicator", "_right_indicator"]:
if i in columns:
- raise ValueError("Cannot use `indicator=True` option when "
- "data contains a column named {name}"
- .format(name=i))
+ raise ValueError(
+ "Cannot use `indicator=True` option when "
+ "data contains a column named {name}".format(name=i)
+ )
if self.indicator_name in columns:
raise ValueError(
- "Cannot use name of an existing column for indicator column")
+ "Cannot use name of an existing column for indicator column"
+ )
left = left.copy()
right = right.copy()
- left['_left_indicator'] = 1
- left['_left_indicator'] = left['_left_indicator'].astype('int8')
+ left["_left_indicator"] = 1
+ left["_left_indicator"] = left["_left_indicator"].astype("int8")
- right['_right_indicator'] = 2
- right['_right_indicator'] = right['_right_indicator'].astype('int8')
+ right["_right_indicator"] = 2
+ right["_right_indicator"] = right["_right_indicator"].astype("int8")
return left, right
def _indicator_post_merge(self, result):
- result['_left_indicator'] = result['_left_indicator'].fillna(0)
- result['_right_indicator'] = result['_right_indicator'].fillna(0)
+ result["_left_indicator"] = result["_left_indicator"].fillna(0)
+ result["_right_indicator"] = result["_right_indicator"].fillna(0)
- result[self.indicator_name] = Categorical((result['_left_indicator'] +
- result['_right_indicator']),
- categories=[1, 2, 3])
- result[self.indicator_name] = (
- result[self.indicator_name]
- .cat.rename_categories(['left_only', 'right_only', 'both']))
+ result[self.indicator_name] = Categorical(
+ (result["_left_indicator"] + result["_right_indicator"]),
+ categories=[1, 2, 3],
+ )
+ result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(
+ ["left_only", "right_only", "both"]
+ )
- result = result.drop(labels=['_left_indicator', '_right_indicator'],
- axis=1)
+ result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
return result
def _maybe_restore_index_levels(self, result):
@@ -639,12 +734,14 @@ def _maybe_restore_index_levels(self, result):
None
"""
names_to_restore = []
- for name, left_key, right_key in zip(self.join_names,
- self.left_on,
- self.right_on):
- if (self.orig_left._is_level_reference(left_key) and
- self.orig_right._is_level_reference(right_key) and
- name not in result.index.names):
+ for name, left_key, right_key in zip(
+ self.join_names, self.left_on, self.right_on
+ ):
+ if (
+ self.orig_left._is_level_reference(left_key)
+ and self.orig_right._is_level_reference(right_key)
+ and name not in result.index.names
+ ):
names_to_restore.append(name)
@@ -674,8 +771,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
if left_has_missing:
take_right = self.right_join_keys[i]
- if not is_dtype_equal(result[name].dtype,
- self.left[name].dtype):
+ if not is_dtype_equal(
+ result[name].dtype, self.left[name].dtype
+ ):
take_left = self.left[name]._values
elif name in self.right:
@@ -686,12 +784,12 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
if right_has_missing:
take_left = self.left_join_keys[i]
- if not is_dtype_equal(result[name].dtype,
- self.right[name].dtype):
+ if not is_dtype_equal(
+ result[name].dtype, self.right[name].dtype
+ ):
take_right = self.right[name]._values
- elif left_indexer is not None \
- and is_array_like(self.left_join_keys[i]):
+ elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
take_left = self.left_join_keys[i]
take_right = self.right_join_keys[i]
@@ -701,15 +799,13 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
lvals = result[name]._values
else:
lfill = na_value_for_dtype(take_left.dtype)
- lvals = algos.take_1d(take_left, left_indexer,
- fill_value=lfill)
+ lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill)
if take_right is None:
rvals = result[name]._values
else:
rfill = na_value_for_dtype(take_right.dtype)
- rvals = algos.take_1d(take_right, right_indexer,
- fill_value=rfill)
+ rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill)
# if we have an all missing left_indexer
# make sure to just use the right values
@@ -724,61 +820,66 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
elif result._is_level_reference(name):
if isinstance(result.index, MultiIndex):
key_col.name = name
- idx_list = [result.index.get_level_values(level_name)
- if level_name != name else key_col
- for level_name in result.index.names]
+ idx_list = [
+ result.index.get_level_values(level_name)
+ if level_name != name
+ else key_col
+ for level_name in result.index.names
+ ]
result.set_index(idx_list, inplace=True)
else:
result.index = Index(key_col, name=name)
else:
- result.insert(i, name or 'key_{i}'.format(i=i), key_col)
+ result.insert(i, name or "key_{i}".format(i=i), key_col)
def _get_join_indexers(self):
""" return the join indexers """
- return _get_join_indexers(self.left_join_keys,
- self.right_join_keys,
- sort=self.sort,
- how=self.how)
+ return _get_join_indexers(
+ self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
+ )
def _get_join_info(self):
left_ax = self.left._data.axes[self.axis]
right_ax = self.right._data.axes[self.axis]
- if self.left_index and self.right_index and self.how != 'asof':
- join_index, left_indexer, right_indexer = \
- left_ax.join(right_ax, how=self.how, return_indexers=True,
- sort=self.sort)
- elif self.right_index and self.how == 'left':
- join_index, left_indexer, right_indexer = \
- _left_join_on_index(left_ax, right_ax, self.left_join_keys,
- sort=self.sort)
-
- elif self.left_index and self.how == 'right':
- join_index, right_indexer, left_indexer = \
- _left_join_on_index(right_ax, left_ax, self.right_join_keys,
- sort=self.sort)
+ if self.left_index and self.right_index and self.how != "asof":
+ join_index, left_indexer, right_indexer = left_ax.join(
+ right_ax, how=self.how, return_indexers=True, sort=self.sort
+ )
+ elif self.right_index and self.how == "left":
+ join_index, left_indexer, right_indexer = _left_join_on_index(
+ left_ax, right_ax, self.left_join_keys, sort=self.sort
+ )
+
+ elif self.left_index and self.how == "right":
+ join_index, right_indexer, left_indexer = _left_join_on_index(
+ right_ax, left_ax, self.right_join_keys, sort=self.sort
+ )
else:
- (left_indexer,
- right_indexer) = self._get_join_indexers()
+ (left_indexer, right_indexer) = self._get_join_indexers()
if self.right_index:
if len(self.left) > 0:
- join_index = self._create_join_index(self.left.index,
- self.right.index,
- left_indexer,
- right_indexer,
- how='right')
+ join_index = self._create_join_index(
+ self.left.index,
+ self.right.index,
+ left_indexer,
+ right_indexer,
+ how="right",
+ )
else:
join_index = self.right.index.take(right_indexer)
left_indexer = np.array([-1] * len(join_index))
elif self.left_index:
if len(self.right) > 0:
- join_index = self._create_join_index(self.right.index,
- self.left.index,
- right_indexer,
- left_indexer,
- how='left')
+ join_index = self._create_join_index(
+ self.right.index,
+ self.left.index,
+ right_indexer,
+ left_indexer,
+ how="left",
+ )
else:
join_index = self.left.index.take(left_indexer)
right_indexer = np.array([-1] * len(join_index))
@@ -789,8 +890,9 @@ def _get_join_info(self):
join_index = join_index.astype(object)
return join_index, left_indexer, right_indexer
- def _create_join_index(self, index, other_index, indexer,
- other_indexer, how='left'):
+ def _create_join_index(
+ self, index, other_index, indexer, other_indexer, how="left"
+ ):
"""
Create a join index by rearranging one index to match another
@@ -805,8 +907,7 @@ def _create_join_index(self, index, other_index, indexer,
-------
join_index
"""
- if (self.how in (how, 'outer') and
- not isinstance(other_index, MultiIndex)):
+ if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
# if final index requires values in other_index but not target
# index, indexer may hold missing (-1) values, causing Index.take
# to take the final value in target index. So, we set the last
@@ -863,8 +964,7 @@ def _get_merge_keys(self):
join_names.append(None) # what to do?
else:
if rk is not None:
- right_keys.append(
- right._get_label_or_level_values(rk))
+ right_keys.append(right._get_label_or_level_values(rk))
join_names.append(rk)
else:
# work-around for merge_asof(right_index=True)
@@ -873,8 +973,7 @@ def _get_merge_keys(self):
else:
if not is_rkey(rk):
if rk is not None:
- right_keys.append(
- right._get_label_or_level_values(rk))
+ right_keys.append(right._get_label_or_level_values(rk))
else:
# work-around for merge_asof(right_index=True)
right_keys.append(right.index)
@@ -902,9 +1001,12 @@ def _get_merge_keys(self):
left_keys.append(left._get_label_or_level_values(k))
join_names.append(k)
if isinstance(self.right.index, MultiIndex):
- right_keys = [lev._values.take(lev_codes) for lev, lev_codes
- in zip(self.right.index.levels,
- self.right.index.codes)]
+ right_keys = [
+ lev._values.take(lev_codes)
+ for lev, lev_codes in zip(
+ self.right.index.levels, self.right.index.codes
+ )
+ ]
else:
right_keys = [self.right.index._values]
elif _any(self.right_on):
@@ -916,9 +1018,12 @@ def _get_merge_keys(self):
right_keys.append(right._get_label_or_level_values(k))
join_names.append(k)
if isinstance(self.left.index, MultiIndex):
- left_keys = [lev._values.take(lev_codes) for lev, lev_codes
- in zip(self.left.index.levels,
- self.left.index.codes)]
+ left_keys = [
+ lev._values.take(lev_codes)
+ for lev, lev_codes in zip(
+ self.left.index.levels, self.left.index.codes
+ )
+ ]
else:
left_keys = [self.left.index.values]
@@ -937,9 +1042,9 @@ def _maybe_coerce_merge_keys(self):
# for example if these are categorical, but are not dtype_equal
# or if we have object and integer dtypes
- for lk, rk, name in zip(self.left_join_keys,
- self.right_join_keys,
- self.join_names):
+ for lk, rk, name in zip(
+ self.left_join_keys, self.right_join_keys, self.join_names
+ ):
if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
continue
@@ -960,10 +1065,11 @@ def _maybe_coerce_merge_keys(self):
elif is_dtype_equal(lk.dtype, rk.dtype):
continue
- msg = ("You are trying to merge on {lk_dtype} and "
- "{rk_dtype} columns. If you wish to proceed "
- "you should use pd.concat".format(lk_dtype=lk.dtype,
- rk_dtype=rk.dtype))
+ msg = (
+ "You are trying to merge on {lk_dtype} and "
+ "{rk_dtype} columns. If you wish to proceed "
+ "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype)
+ )
# if we are numeric, then allow differing
# kinds to proceed, eg. int64 and int8, int and float
@@ -976,51 +1082,60 @@ def _maybe_coerce_merge_keys(self):
# check whether ints and floats
elif is_integer_dtype(rk) and is_float_dtype(lk):
if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
- warnings.warn('You are merging on int and float '
- 'columns where the float values '
- 'are not equal to their int '
- 'representation', UserWarning)
+ warnings.warn(
+ "You are merging on int and float "
+ "columns where the float values "
+ "are not equal to their int "
+ "representation",
+ UserWarning,
+ )
continue
elif is_float_dtype(rk) and is_integer_dtype(lk):
if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
- warnings.warn('You are merging on int and float '
- 'columns where the float values '
- 'are not equal to their int '
- 'representation', UserWarning)
+ warnings.warn(
+ "You are merging on int and float "
+ "columns where the float values "
+ "are not equal to their int "
+ "representation",
+ UserWarning,
+ )
continue
# let's infer and see if we are ok
- elif (lib.infer_dtype(lk, skipna=False)
- == lib.infer_dtype(rk, skipna=False)):
+ elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
+ rk, skipna=False
+ ):
continue
# Check if we are trying to merge on obviously
# incompatible dtypes GH 9780, GH 15800
# bool values are coerced to object
- elif ((lk_is_object and is_bool_dtype(rk)) or
- (is_bool_dtype(lk) and rk_is_object)):
+ elif (lk_is_object and is_bool_dtype(rk)) or (
+ is_bool_dtype(lk) and rk_is_object
+ ):
pass
# object values are allowed to be merged
- elif ((lk_is_object and is_numeric_dtype(rk)) or
- (is_numeric_dtype(lk) and rk_is_object)):
+ elif (lk_is_object and is_numeric_dtype(rk)) or (
+ is_numeric_dtype(lk) and rk_is_object
+ ):
inferred_left = lib.infer_dtype(lk, skipna=False)
inferred_right = lib.infer_dtype(rk, skipna=False)
- bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
- string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']
+ bool_types = ["integer", "mixed-integer", "boolean", "empty"]
+ string_types = ["string", "unicode", "mixed", "bytes", "empty"]
# inferred bool
- if (inferred_left in bool_types and
- inferred_right in bool_types):
+ if inferred_left in bool_types and inferred_right in bool_types:
pass
# unless we are merging non-string-like with string-like
- elif ((inferred_left in string_types and
- inferred_right not in string_types) or
- (inferred_right in string_types and
- inferred_left not in string_types)):
+ elif (
+ inferred_left in string_types and inferred_right not in string_types
+ ) or (
+ inferred_right in string_types and inferred_left not in string_types
+ ):
raise ValueError(msg)
# datetimelikes must match exactly
@@ -1045,12 +1160,10 @@ def _maybe_coerce_merge_keys(self):
# incompatible dtypes. See GH 16900.
if name in self.left.columns:
typ = lk.categories.dtype if lk_is_cat else object
- self.left = self.left.assign(
- **{name: self.left[name].astype(typ)})
+ self.left = self.left.assign(**{name: self.left[name].astype(typ)})
if name in self.right.columns:
typ = rk.categories.dtype if rk_is_cat else object
- self.right = self.right.assign(
- **{name: self.right[name].astype(typ)})
+ self.right = self.right.assign(**{name: self.right[name].astype(typ)})
def _validate_specification(self):
# Hm, any way to make this logic less complicated??
@@ -1060,43 +1173,53 @@ def _validate_specification(self):
self.left_on, self.right_on = (), ()
elif self.left_index:
if self.right_on is None:
- raise MergeError('Must pass right_on or right_index=True')
+ raise MergeError("Must pass right_on or right_index=True")
elif self.right_index:
if self.left_on is None:
- raise MergeError('Must pass left_on or left_index=True')
+ raise MergeError("Must pass left_on or left_index=True")
else:
# use the common columns
- common_cols = self.left.columns.intersection(
- self.right.columns)
+ common_cols = self.left.columns.intersection(self.right.columns)
if len(common_cols) == 0:
raise MergeError(
- 'No common columns to perform merge on. '
- 'Merge options: left_on={lon}, right_on={ron}, '
- 'left_index={lidx}, right_index={ridx}'
- .format(lon=self.left_on, ron=self.right_on,
- lidx=self.left_index, ridx=self.right_index))
+ "No common columns to perform merge on. "
+ "Merge options: left_on={lon}, right_on={ron}, "
+ "left_index={lidx}, right_index={ridx}".format(
+ lon=self.left_on,
+ ron=self.right_on,
+ lidx=self.left_index,
+ ridx=self.right_index,
+ )
+ )
if not common_cols.is_unique:
- raise MergeError("Data columns not unique: {common!r}"
- .format(common=common_cols))
+ raise MergeError(
+ "Data columns not unique: {common!r}".format(common=common_cols)
+ )
self.left_on = self.right_on = common_cols
elif self.on is not None:
if self.left_on is not None or self.right_on is not None:
- raise MergeError('Can only pass argument "on" OR "left_on" '
- 'and "right_on", not a combination of both.')
+ raise MergeError(
+ 'Can only pass argument "on" OR "left_on" '
+ 'and "right_on", not a combination of both.'
+ )
self.left_on = self.right_on = self.on
elif self.left_on is not None:
n = len(self.left_on)
if self.right_index:
if len(self.left_on) != self.right.index.nlevels:
- raise ValueError('len(left_on) must equal the number '
- 'of levels in the index of "right"')
+ raise ValueError(
+ "len(left_on) must equal the number "
+ 'of levels in the index of "right"'
+ )
self.right_on = [None] * n
elif self.right_on is not None:
n = len(self.right_on)
if self.left_index:
if len(self.right_on) != self.left.index.nlevels:
- raise ValueError('len(right_on) must equal the number '
- 'of levels in the index of "left"')
+ raise ValueError(
+ "len(right_on) must equal the number "
+ 'of levels in the index of "left"'
+ )
self.left_on = [None] * n
if len(self.right_on) != len(self.left_on):
raise ValueError("len(right_on) must equal len(left_on)")
@@ -1107,46 +1230,53 @@ def _validate(self, validate):
if self.left_index:
left_unique = self.orig_left.index.is_unique
else:
- left_unique = MultiIndex.from_arrays(self.left_join_keys
- ).is_unique
+ left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
if self.right_index:
right_unique = self.orig_right.index.is_unique
else:
- right_unique = MultiIndex.from_arrays(self.right_join_keys
- ).is_unique
+ right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
# Check data integrity
if validate in ["one_to_one", "1:1"]:
if not left_unique and not right_unique:
- raise MergeError("Merge keys are not unique in either left"
- " or right dataset; not a one-to-one merge")
+ raise MergeError(
+ "Merge keys are not unique in either left"
+ " or right dataset; not a one-to-one merge"
+ )
elif not left_unique:
- raise MergeError("Merge keys are not unique in left dataset;"
- " not a one-to-one merge")
+ raise MergeError(
+ "Merge keys are not unique in left dataset;"
+ " not a one-to-one merge"
+ )
elif not right_unique:
- raise MergeError("Merge keys are not unique in right dataset;"
- " not a one-to-one merge")
+ raise MergeError(
+ "Merge keys are not unique in right dataset;"
+ " not a one-to-one merge"
+ )
elif validate in ["one_to_many", "1:m"]:
if not left_unique:
- raise MergeError("Merge keys are not unique in left dataset;"
- " not a one-to-many merge")
+ raise MergeError(
+ "Merge keys are not unique in left dataset;"
+ " not a one-to-many merge"
+ )
elif validate in ["many_to_one", "m:1"]:
if not right_unique:
- raise MergeError("Merge keys are not unique in right dataset;"
- " not a many-to-one merge")
+ raise MergeError(
+ "Merge keys are not unique in right dataset;"
+ " not a many-to-one merge"
+ )
- elif validate in ['many_to_many', 'm:m']:
+ elif validate in ["many_to_many", "m:m"]:
pass
else:
raise ValueError("Not a valid argument for validate")
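The validate checks above surface through pd.merge; a short sketch of the failure mode:

    import pandas as pd

    left = pd.DataFrame({"key": ["a", "a"], "x": [1, 2]})
    right = pd.DataFrame({"key": ["a"], "y": [3]})
    # duplicate keys on the left make this raise MergeError: not a one-to-one merge
    pd.merge(left, right, on="key", validate="one_to_one")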
-def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
- **kwargs):
+def _get_join_indexers(left_keys, right_keys, sort=False, how="inner", **kwargs):
"""
Parameters
@@ -1164,14 +1294,15 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
"""
from functools import partial
- assert len(left_keys) == len(right_keys), \
- 'left_key and right_keys must be the same length'
+ assert len(left_keys) == len(
+ right_keys
+    ), "left_keys and right_keys must be the same length"
# bind `sort` arg. of _factorize_keys
fkeys = partial(_factorize_keys, sort=sort)
# get left & right join labels and num. of levels at each location
- llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))
+ llab, rlab, shape = map(list, zip(*map(fkeys, left_keys, right_keys)))
# get flat i8 keys from label lists
lkey, rkey = _get_join_keys(llab, rlab, shape, sort)
@@ -1183,15 +1314,16 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner',
# preserve left frame order if how == 'left' and sort == False
kwargs = copy.copy(kwargs)
- if how == 'left':
- kwargs['sort'] = sort
+ if how == "left":
+ kwargs["sort"] = sort
join_func = _join_functions[how]
return join_func(lkey, rkey, count, **kwargs)
-def _restore_dropped_levels_multijoin(left, right, dropped_level_names,
- join_index, lindexer, rindexer):
+def _restore_dropped_levels_multijoin(
+ left, right, dropped_level_names, join_index, lindexer, rindexer
+):
"""
*this is an internal non-public method*
@@ -1232,8 +1364,7 @@ def _convert_to_mulitindex(index):
if isinstance(index, MultiIndex):
return index
else:
- return MultiIndex.from_arrays([index.values],
- names=[index.name])
+ return MultiIndex.from_arrays([index.values], names=[index.name])
# For multi-multi joins with one overlapping level,
    # the returned index is of type Index
@@ -1280,21 +1411,39 @@ def _convert_to_mulitindex(index):
class _OrderedMerge(_MergeOperation):
- _merge_type = 'ordered_merge'
-
- def __init__(self, left, right, on=None, left_on=None, right_on=None,
- left_index=False, right_index=False, axis=1,
- suffixes=('_x', '_y'), copy=True,
- fill_method=None, how='outer'):
+ _merge_type = "ordered_merge"
+
+ def __init__(
+ self,
+ left,
+ right,
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ axis=1,
+ suffixes=("_x", "_y"),
+ copy=True,
+ fill_method=None,
+ how="outer",
+ ):
self.fill_method = fill_method
- _MergeOperation.__init__(self, left, right, on=on, left_on=left_on,
- left_index=left_index,
- right_index=right_index,
- right_on=right_on, axis=axis,
- how=how, suffixes=suffixes,
- sort=True # factorize sorts
- )
+ _MergeOperation.__init__(
+ self,
+ left,
+ right,
+ on=on,
+ left_on=left_on,
+ left_index=left_index,
+ right_index=right_index,
+ right_on=right_on,
+ axis=axis,
+ how=how,
+ suffixes=suffixes,
+ sort=True, # factorize sorts
+ )
def get_result(self):
join_index, left_indexer, right_indexer = self._get_join_info()
@@ -1303,25 +1452,26 @@ def get_result(self):
ldata, rdata = self.left._data, self.right._data
lsuf, rsuf = self.suffixes
- llabels, rlabels = _items_overlap_with_suffix(ldata.items, lsuf,
- rdata.items, rsuf)
+ llabels, rlabels = _items_overlap_with_suffix(
+ ldata.items, lsuf, rdata.items, rsuf
+ )
- if self.fill_method == 'ffill':
+ if self.fill_method == "ffill":
left_join_indexer = libjoin.ffill_indexer(left_indexer)
right_join_indexer = libjoin.ffill_indexer(right_indexer)
else:
left_join_indexer = left_indexer
right_join_indexer = right_indexer
- lindexers = {
- 1: left_join_indexer} if left_join_indexer is not None else {}
- rindexers = {
- 1: right_join_indexer} if right_join_indexer is not None else {}
+ lindexers = {1: left_join_indexer} if left_join_indexer is not None else {}
+ rindexers = {1: right_join_indexer} if right_join_indexer is not None else {}
result_data = concatenate_block_managers(
[(ldata, lindexers), (rdata, rindexers)],
axes=[llabels.append(rlabels), join_index],
- concat_axis=0, copy=self.copy)
+ concat_axis=0,
+ copy=self.copy,
+ )
typ = self.left._constructor
result = typ(result_data).__finalize__(self, method=self._merge_type)
@@ -1332,43 +1482,56 @@ def get_result(self):
def _asof_function(direction):
- name = 'asof_join_{dir}'.format(dir=direction)
+ name = "asof_join_{dir}".format(dir=direction)
return getattr(libjoin, name, None)
def _asof_by_function(direction):
- name = 'asof_join_{dir}_on_X_by_Y'.format(dir=direction)
+ name = "asof_join_{dir}_on_X_by_Y".format(dir=direction)
return getattr(libjoin, name, None)
_type_casters = {
- 'int64_t': ensure_int64,
- 'double': ensure_float64,
- 'object': ensure_object,
+ "int64_t": ensure_int64,
+ "double": ensure_float64,
+ "object": ensure_object,
}
def _get_cython_type_upcast(dtype):
""" Upcast a dtype to 'int64_t', 'double', or 'object' """
if is_integer_dtype(dtype):
- return 'int64_t'
+ return "int64_t"
elif is_float_dtype(dtype):
- return 'double'
+ return "double"
else:
- return 'object'
+ return "object"
class _AsOfMerge(_OrderedMerge):
- _merge_type = 'asof_merge'
-
- def __init__(self, left, right, on=None, left_on=None, right_on=None,
- left_index=False, right_index=False,
- by=None, left_by=None, right_by=None,
- axis=1, suffixes=('_x', '_y'), copy=True,
- fill_method=None,
- how='asof', tolerance=None,
- allow_exact_matches=True,
- direction='backward'):
+ _merge_type = "asof_merge"
+
+ def __init__(
+ self,
+ left,
+ right,
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ by=None,
+ left_by=None,
+ right_by=None,
+ axis=1,
+ suffixes=("_x", "_y"),
+ copy=True,
+ fill_method=None,
+ how="asof",
+ tolerance=None,
+ allow_exact_matches=True,
+ direction="backward",
+ ):
self.by = by
self.left_by = left_by
@@ -1377,11 +1540,20 @@ def __init__(self, left, right, on=None, left_on=None, right_on=None,
self.allow_exact_matches = allow_exact_matches
self.direction = direction
- _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on,
- right_on=right_on, left_index=left_index,
- right_index=right_index, axis=axis,
- how=how, suffixes=suffixes,
- fill_method=fill_method)
+ _OrderedMerge.__init__(
+ self,
+ left,
+ right,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ axis=axis,
+ how=how,
+ suffixes=suffixes,
+ fill_method=fill_method,
+ )
def _validate_specification(self):
super()._validate_specification()
@@ -1402,13 +1574,12 @@ def _validate_specification(self):
# set 'by' columns
if self.by is not None:
if self.left_by is not None or self.right_by is not None:
- raise MergeError('Can only pass by OR left_by '
- 'and right_by')
+                raise MergeError("Can only pass by OR left_by and right_by")
self.left_by = self.right_by = self.by
if self.left_by is None and self.right_by is not None:
- raise MergeError('missing left_by')
+ raise MergeError("missing left_by")
if self.left_by is not None and self.right_by is None:
- raise MergeError('missing right_by')
+ raise MergeError("missing right_by")
# add 'by' to our key-list so we can have it in the
# output as a key
@@ -1419,15 +1590,16 @@ def _validate_specification(self):
self.right_by = [self.right_by]
if len(self.left_by) != len(self.right_by):
- raise MergeError('left_by and right_by must be same length')
+ raise MergeError("left_by and right_by must be same length")
self.left_on = self.left_by + list(self.left_on)
self.right_on = self.right_by + list(self.right_on)
# check 'direction' is valid
- if self.direction not in ['backward', 'forward', 'nearest']:
- raise MergeError('direction invalid: {direction}'
- .format(direction=self.direction))
+ if self.direction not in ["backward", "forward", "nearest"]:
+ raise MergeError(
+ "direction invalid: {direction}".format(direction=self.direction)
+ )
@property
def _asof_key(self):
@@ -1437,15 +1609,12 @@ def _asof_key(self):
def _get_merge_keys(self):
# note this function has side effects
- (left_join_keys,
- right_join_keys,
- join_names) = super()._get_merge_keys()
+ (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()
# validate index types are the same
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
if not is_dtype_equal(lk.dtype, rk.dtype):
- if (is_categorical_dtype(lk.dtype) and
- is_categorical_dtype(rk.dtype)):
+ if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
# The generic error message is confusing for categoricals.
#
# In this function, the join keys include both the original
@@ -1454,15 +1623,19 @@ def _get_merge_keys(self):
# are not supported for the former, but will fail
# later with a ValueError, so we don't *need* to check
# for them here.
- msg = ("incompatible merge keys [{i}] {lkdtype} and "
- "{rkdtype}, both sides category, but not equal ones"
- .format(i=i, lkdtype=repr(lk.dtype),
- rkdtype=repr(rk.dtype)))
+ msg = (
+ "incompatible merge keys [{i}] {lkdtype} and "
+ "{rkdtype}, both sides category, but not equal ones".format(
+ i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
+ )
+ )
else:
- msg = ("incompatible merge keys [{i}] {lkdtype} and "
- "{rkdtype}, must be the same type"
- .format(i=i, lkdtype=repr(lk.dtype),
- rkdtype=repr(rk.dtype)))
+ msg = (
+ "incompatible merge keys [{i}] {lkdtype} and "
+ "{rkdtype}, must be the same type".format(
+ i=i, lkdtype=repr(lk.dtype), rkdtype=repr(rk.dtype)
+ )
+ )
raise MergeError(msg)
# validate tolerance; must be a Timedelta if we have a DTI
@@ -1473,10 +1646,12 @@ def _get_merge_keys(self):
else:
lt = left_join_keys[-1]
- msg = ("incompatible tolerance {tolerance}, must be compat "
- "with type {lkdtype}".format(
- tolerance=type(self.tolerance),
- lkdtype=repr(lt.dtype)))
+ msg = (
+ "incompatible tolerance {tolerance}, must be compat "
+ "with type {lkdtype}".format(
+ tolerance=type(self.tolerance), lkdtype=repr(lt.dtype)
+ )
+ )
if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt):
if not isinstance(self.tolerance, Timedelta):
@@ -1511,16 +1686,18 @@ def _get_join_indexers(self):
def flip(xs):
""" unlike np.transpose, this returns an array of tuples """
- labels = list(string.ascii_lowercase[:len(xs)])
+ labels = list(string.ascii_lowercase[: len(xs)])
dtypes = [x.dtype for x in xs]
labeled_dtypes = list(zip(labels, dtypes))
return np.array(list(zip(*xs)), labeled_dtypes)
# values to compare
- left_values = (self.left.index.values if self.left_index else
- self.left_join_keys[-1])
- right_values = (self.right.index.values if self.right_index else
- self.right_join_keys[-1])
+ left_values = (
+ self.left.index.values if self.left_index else self.left_join_keys[-1]
+ )
+ right_values = (
+ self.right.index.values if self.right_index else self.right_join_keys[-1]
+ )
tolerance = self.tolerance
# we require sortedness and non-null values in the join keys
@@ -1529,20 +1706,20 @@ def flip(xs):
if not Index(left_values).is_monotonic:
if isnull(left_values).any():
- raise ValueError(msg_missings.format(side='left'))
+ raise ValueError(msg_missings.format(side="left"))
else:
- raise ValueError(msg_sorted.format(side='left'))
+ raise ValueError(msg_sorted.format(side="left"))
if not Index(right_values).is_monotonic:
if isnull(right_values).any():
- raise ValueError(msg_missings.format(side='right'))
+ raise ValueError(msg_missings.format(side="right"))
else:
- raise ValueError(msg_sorted.format(side='right'))
+ raise ValueError(msg_sorted.format(side="right"))
# initial type conversion as needed
if needs_i8_conversion(left_values):
- left_values = left_values.view('i8')
- right_values = right_values.view('i8')
+ left_values = left_values.view("i8")
+ right_values = right_values.view("i8")
if tolerance is not None:
tolerance = tolerance.value
@@ -1572,19 +1749,18 @@ def flip(xs):
# choose appropriate function by type
func = _asof_by_function(self.direction)
- return func(left_values,
- right_values,
- left_by_values,
- right_by_values,
- self.allow_exact_matches,
- tolerance)
+ return func(
+ left_values,
+ right_values,
+ left_by_values,
+ right_by_values,
+ self.allow_exact_matches,
+ tolerance,
+ )
else:
# choose appropriate function by type
func = _asof_function(self.direction)
- return func(left_values,
- right_values,
- self.allow_exact_matches,
- tolerance)
+ return func(left_values, right_values, self.allow_exact_matches, tolerance)
def _get_multiindex_indexer(join_keys, index, sort):
@@ -1594,13 +1770,11 @@ def _get_multiindex_indexer(join_keys, index, sort):
fkeys = partial(_factorize_keys, sort=sort)
# left & right join labels and num. of levels at each location
- rcodes, lcodes, shape = map(list, zip(* map(fkeys,
- index.levels,
- join_keys)))
+ rcodes, lcodes, shape = map(list, zip(*map(fkeys, index.levels, join_keys)))
if sort:
rcodes = list(map(np.take, rcodes, index.codes))
else:
- i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+ i8copy = lambda a: a.astype("i8", subok=False, copy=True)
rcodes = list(map(i8copy, index.codes))
# fix right labels if there were any nulls
@@ -1628,29 +1802,31 @@ def _get_single_indexer(join_key, index, sort=False):
left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)
left_indexer, right_indexer = libjoin.left_outer_join(
- ensure_int64(left_key),
- ensure_int64(right_key),
- count, sort=sort)
+ ensure_int64(left_key), ensure_int64(right_key), count, sort=sort
+ )
return left_indexer, right_indexer
def _left_join_on_index(left_ax, right_ax, join_keys, sort=False):
if len(join_keys) > 1:
- if not ((isinstance(right_ax, MultiIndex) and
- len(join_keys) == right_ax.nlevels)):
- raise AssertionError("If more than one join key is given then "
- "'right_ax' must be a MultiIndex and the "
- "number of join keys must be the number of "
- "levels in right_ax")
-
- left_indexer, right_indexer = \
- _get_multiindex_indexer(join_keys, right_ax, sort=sort)
+ if not (
+ (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels)
+ ):
+ raise AssertionError(
+ "If more than one join key is given then "
+ "'right_ax' must be a MultiIndex and the "
+ "number of join keys must be the number of "
+ "levels in right_ax"
+ )
+
+ left_indexer, right_indexer = _get_multiindex_indexer(
+ join_keys, right_ax, sort=sort
+ )
else:
jkey = join_keys[0]
- left_indexer, right_indexer = \
- _get_single_indexer(jkey, right_ax, sort=sort)
+ left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort)
if sort or len(left_ax) != len(left_indexer):
# if asked to sort or there are 1-to-many matches
@@ -1667,22 +1843,22 @@ def _right_outer_join(x, y, max_groups):
_join_functions = {
- 'inner': libjoin.inner_join,
- 'left': libjoin.left_outer_join,
- 'right': _right_outer_join,
- 'outer': libjoin.full_outer_join,
+ "inner": libjoin.inner_join,
+ "left": libjoin.left_outer_join,
+ "right": _right_outer_join,
+ "outer": libjoin.full_outer_join,
}
def _factorize_keys(lk, rk, sort=True):
# Some pre-processing for non-ndarray lk / rk
if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk):
- lk = getattr(lk, '_values', lk)._data
- rk = getattr(rk, '_values', rk)._data
+ lk = getattr(lk, "_values", lk)._data
+ rk = getattr(rk, "_values", rk)._data
- elif (is_categorical_dtype(lk) and
- is_categorical_dtype(rk) and
- lk.is_dtype_equal(rk)):
+ elif (
+ is_categorical_dtype(lk) and is_categorical_dtype(rk) and lk.is_dtype_equal(rk)
+ ):
if lk.categories.equals(rk.categories):
# if we exactly match in categories, allow us to factorize on codes
rk = rk.codes
@@ -1693,9 +1869,11 @@ def _factorize_keys(lk, rk, sort=True):
lk = ensure_int64(lk.codes)
rk = ensure_int64(rk)
- elif (is_extension_array_dtype(lk.dtype) and
- is_extension_array_dtype(rk.dtype) and
- lk.dtype == rk.dtype):
+ elif (
+ is_extension_array_dtype(lk.dtype)
+ and is_extension_array_dtype(rk.dtype)
+ and lk.dtype == rk.dtype
+ ):
lk, _ = lk._values_for_factorize()
rk, _ = rk._values_for_factorize()
@@ -1705,8 +1883,9 @@ def _factorize_keys(lk, rk, sort=True):
klass = libhashtable.Int64Factorizer
lk = ensure_int64(com.values_from_object(lk))
rk = ensure_int64(com.values_from_object(rk))
- elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and
- issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))):
+ elif issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and issubclass(
+ rk.dtype.type, (np.timedelta64, np.datetime64)
+ ):
# GH#23917 TODO: Needs tests for non-matching dtypes
klass = libhashtable.Int64Factorizer
lk = ensure_int64(com.values_from_object(lk))
@@ -1765,12 +1944,12 @@ def _get_join_keys(llab, rlab, shape, sort):
nlev = next(filter(pred, range(len(shape), 0, -1)))
# get keys for the first `nlev` levels
- stride = np.prod(shape[1:nlev], dtype='i8')
- lkey = stride * llab[0].astype('i8', subok=False, copy=False)
- rkey = stride * rlab[0].astype('i8', subok=False, copy=False)
+ stride = np.prod(shape[1:nlev], dtype="i8")
+ lkey = stride * llab[0].astype("i8", subok=False, copy=False)
+ rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
for i in range(1, nlev):
- with np.errstate(divide='ignore'):
+ with np.errstate(divide="ignore"):
stride //= shape[i]
lkey += llab[i] * stride
rkey += rlab[i] * stride
@@ -1803,12 +1982,14 @@ def validate_operand(obj):
return obj
elif isinstance(obj, Series):
if obj.name is None:
- raise ValueError('Cannot merge a Series without a name')
+ raise ValueError("Cannot merge a Series without a name")
else:
return obj.to_frame()
else:
- raise TypeError('Can only merge Series or DataFrame objects, '
- 'a {obj} was passed'.format(obj=type(obj)))
+ raise TypeError(
+ "Can only merge Series or DataFrame objects, "
+ "a {obj} was passed".format(obj=type(obj))
+ )
def _items_overlap_with_suffix(left, lsuffix, right, rsuffix):
@@ -1823,8 +2004,10 @@ def _items_overlap_with_suffix(left, lsuffix, right, rsuffix):
return left, right
if not lsuffix and not rsuffix:
- raise ValueError('columns overlap but no suffix specified: '
- '{rename}'.format(rename=to_rename))
+ raise ValueError(
+ "columns overlap but no suffix specified: "
+ "{rename}".format(rename=to_rename)
+ )
def renamer(x, suffix):
"""
@@ -1843,11 +2026,10 @@ def renamer(x, suffix):
x : renamed column name
"""
if x in to_rename and suffix is not None:
- return '{x}{suffix}'.format(x=x, suffix=suffix)
+ return "{x}{suffix}".format(x=x, suffix=suffix)
return x
lrenamer = partial(renamer, suffix=lsuffix)
rrenamer = partial(renamer, suffix=rsuffix)
- return (_transform_index(left, lrenamer),
- _transform_index(right, rrenamer))
+ return (_transform_index(left, lrenamer), _transform_index(right, rrenamer))
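
To summarize the conventions driving most of the merge.py churn above: black normalizes string quotes to double quotes, and when a call no longer fits the default 88-character limit it first moves the arguments onto a single indented line (as in the libjoin.left_outer_join hunk), falling back to one argument per line with a trailing comma only when that indented line would still be too long (as in the concatenate_block_managers hunk). The sketch below is a minimal self-contained illustration; merge_frames and the frame names are made up and have no counterpart in pandas.

    def merge_frames(left, right, how, suffixes, fill_method, copy):
        """Stand-in with no real behavior; it only gives the call below a target."""
        return left, right, how, suffixes, fill_method, copy


    left_frame, right_frame = {"a": 1}, {"a": 2}

    # Pre-black the call might be hand-wrapped with single quotes:
    #   result = merge_frames(left_frame, right_frame, how='outer',
    #                         suffixes=('_left', '_right'),
    #                         fill_method=None, copy=True)
    # black output: double quotes, and because the arguments also do not fit
    # on a single indented line, each argument gets its own line plus a
    # trailing comma before the closing parenthesis.
    result = merge_frames(
        left_frame,
        right_frame,
        how="outer",
        suffixes=("_left", "_right"),
        fill_method=None,
        copy=True,
    )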
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 6374dd1b463f3..188f2edd96590 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -17,11 +17,20 @@
# Note: We need to make sure `frame` is imported before `pivot`, otherwise
# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
-@Substitution('\ndata : DataFrame')
-@Appender(_shared_docs['pivot_table'], indents=1)
-def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
- fill_value=None, margins=False, dropna=True,
- margins_name='All', observed=False):
+@Substitution("\ndata : DataFrame")
+@Appender(_shared_docs["pivot_table"], indents=1)
+def pivot_table(
+ data,
+ values=None,
+ index=None,
+ columns=None,
+ aggfunc="mean",
+ fill_value=None,
+ margins=False,
+ dropna=True,
+ margins_name="All",
+ observed=False,
+):
index = _convert_by(index)
columns = _convert_by(columns)
@@ -29,14 +38,20 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
pieces = []
keys = []
for func in aggfunc:
- table = pivot_table(data, values=values, index=index,
- columns=columns,
- fill_value=fill_value, aggfunc=func,
- margins=margins, dropna=dropna,
- margins_name=margins_name,
- observed=observed)
+ table = pivot_table(
+ data,
+ values=values,
+ index=index,
+ columns=columns,
+ fill_value=fill_value,
+ aggfunc=func,
+ margins=margins,
+ dropna=dropna,
+ margins_name=margins_name,
+ observed=observed,
+ )
pieces.append(table)
- keys.append(getattr(func, '__name__', func))
+ keys.append(getattr(func, "__name__", func))
return concat(pieces, keys=keys, axis=1)
@@ -80,7 +95,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
grouped = data.groupby(keys, observed=observed)
agged = grouped.agg(aggfunc)
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
- agged = agged.dropna(how='all')
+ agged = agged.dropna(how="all")
# gh-21133
# we want to down cast if
@@ -88,8 +103,12 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
# as we grouped with a NaN value
# and then dropped, coercing to floats
for v in values:
- if (v in data and is_integer_dtype(data[v]) and
- v in agged and not is_integer_dtype(agged[v])):
+ if (
+ v in data
+ and is_integer_dtype(data[v])
+ and v in agged
+ and not is_integer_dtype(agged[v])
+ ):
agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)
table = agged
@@ -97,7 +116,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
# Related GH #17123
# If index_names are integers, determine whether the integers refer
# to the level position or name.
- index_names = agged.index.names[:len(index)]
+ index_names = agged.index.names[: len(index)]
to_unstack = []
for i in range(len(index), len(keys)):
name = agged.index.names[i]
@@ -109,33 +128,47 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
if not dropna:
from pandas import MultiIndex
+
if table.index.nlevels > 1:
- m = MultiIndex.from_arrays(cartesian_product(table.index.levels),
- names=table.index.names)
+ m = MultiIndex.from_arrays(
+ cartesian_product(table.index.levels), names=table.index.names
+ )
table = table.reindex(m, axis=0)
if table.columns.nlevels > 1:
- m = MultiIndex.from_arrays(cartesian_product(table.columns.levels),
- names=table.columns.names)
+ m = MultiIndex.from_arrays(
+ cartesian_product(table.columns.levels), names=table.columns.names
+ )
table = table.reindex(m, axis=1)
if isinstance(table, ABCDataFrame):
table = table.sort_index(axis=1)
if fill_value is not None:
- table = table.fillna(value=fill_value, downcast='infer')
+ table = table.fillna(value=fill_value, downcast="infer")
if margins:
if dropna:
data = data[data.notna().all(axis=1)]
- table = _add_margins(table, data, values, rows=index,
- cols=columns, aggfunc=aggfunc,
- observed=dropna,
- margins_name=margins_name, fill_value=fill_value)
+ table = _add_margins(
+ table,
+ data,
+ values,
+ rows=index,
+ cols=columns,
+ aggfunc=aggfunc,
+ observed=dropna,
+ margins_name=margins_name,
+ fill_value=fill_value,
+ )
# discard the top level
- if (values_passed and not values_multi and not table.empty and
- (table.columns.nlevels > 1)):
+ if (
+ values_passed
+ and not values_multi
+ and not table.empty
+ and (table.columns.nlevels > 1)
+ ):
table = table[values[0]]
if len(index) == 0 and len(columns) > 0:
@@ -143,15 +176,24 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
# GH 15193 Make sure empty columns are removed if dropna=True
if isinstance(table, ABCDataFrame) and dropna:
- table = table.dropna(how='all', axis=1)
+ table = table.dropna(how="all", axis=1)
return table
-def _add_margins(table, data, values, rows, cols, aggfunc,
- observed=None, margins_name='All', fill_value=None):
+def _add_margins(
+ table,
+ data,
+ values,
+ rows,
+ cols,
+ aggfunc,
+ observed=None,
+ margins_name="All",
+ fill_value=None,
+):
if not isinstance(margins_name, str):
- raise ValueError('margins_name argument must be a string')
+ raise ValueError("margins_name argument must be a string")
msg = 'Conflicting name "{name}" in margins'.format(name=margins_name)
for level in table.index.names:
@@ -161,13 +203,13 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
# could be passed a Series object with no 'columns'
- if hasattr(table, 'columns'):
+ if hasattr(table, "columns"):
for level in table.columns.names[1:]:
if margins_name in table.columns.get_level_values(level):
raise ValueError(msg)
if len(rows) > 1:
- key = (margins_name,) + ('',) * (len(rows) - 1)
+ key = (margins_name,) + ("",) * (len(rows) - 1)
else:
key = margins_name
@@ -177,17 +219,24 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
return table.append(Series({key: grand_margin[margins_name]}))
if values:
- marginal_result_set = _generate_marginal_results(table, data, values,
- rows, cols, aggfunc,
- observed,
- grand_margin,
- margins_name)
+ marginal_result_set = _generate_marginal_results(
+ table,
+ data,
+ values,
+ rows,
+ cols,
+ aggfunc,
+ observed,
+ grand_margin,
+ margins_name,
+ )
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set
else:
marginal_result_set = _generate_marginal_results_without_values(
- table, data, rows, cols, aggfunc, observed, margins_name)
+ table, data, rows, cols, aggfunc, observed, margins_name
+ )
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set
@@ -200,6 +249,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
row_margin[k] = grand_margin[k[0]]
from pandas import DataFrame
+
margin_dummy = DataFrame(row_margin, columns=[key]).T
row_names = result.index.names
@@ -218,8 +268,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
return result
-def _compute_grand_margin(data, values, aggfunc,
- margins_name='All'):
+def _compute_grand_margin(data, values, aggfunc, margins_name="All"):
if values:
grand_margin = {}
@@ -241,26 +290,22 @@ def _compute_grand_margin(data, values, aggfunc,
return {margins_name: aggfunc(data.index)}
-def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
- observed,
- grand_margin,
- margins_name='All'):
+def _generate_marginal_results(
+ table, data, values, rows, cols, aggfunc, observed, grand_margin, margins_name="All"
+):
if len(cols) > 0:
# need to "interleave" the margins
table_pieces = []
margin_keys = []
def _all_key(key):
- return (key, margins_name) + ('',) * (len(cols) - 1)
+ return (key, margins_name) + ("",) * (len(cols) - 1)
if len(rows) > 0:
- margin = data[rows + values].groupby(
- rows, observed=observed).agg(aggfunc)
+ margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
cat_axis = 1
- for key, piece in table.groupby(level=0,
- axis=cat_axis,
- observed=observed):
+ for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
all_key = _all_key(key)
# we are going to mutate this, so need to copy!
@@ -270,9 +315,11 @@ def _all_key(key):
except TypeError:
# we cannot reshape, so coerce the axis
- piece.set_axis(piece._get_axis(
- cat_axis)._to_safe_for_reshape(),
- axis=cat_axis, inplace=True)
+ piece.set_axis(
+ piece._get_axis(cat_axis)._to_safe_for_reshape(),
+ axis=cat_axis,
+ inplace=True,
+ )
piece[all_key] = margin[key]
table_pieces.append(piece)
@@ -280,9 +327,7 @@ def _all_key(key):
else:
margin = grand_margin
cat_axis = 0
- for key, piece in table.groupby(level=0,
- axis=cat_axis,
- observed=observed):
+ for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
all_key = _all_key(key)
table_pieces.append(piece)
table_pieces.append(Series(margin[key], index=[all_key]))
@@ -297,8 +342,7 @@ def _all_key(key):
margin_keys = table.columns
if len(cols) > 0:
- row_margin = data[cols + values].groupby(
- cols, observed=observed).agg(aggfunc)
+ row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
row_margin = row_margin.stack()
# slight hack
@@ -311,8 +355,8 @@ def _all_key(key):
def _generate_marginal_results_without_values(
- table, data, rows, cols, aggfunc,
- observed, margins_name='All'):
+ table, data, rows, cols, aggfunc, observed, margins_name="All"
+):
if len(cols) > 0:
# need to "interleave" the margins
margin_keys = []
@@ -320,20 +364,17 @@ def _generate_marginal_results_without_values(
def _all_key():
if len(cols) == 1:
return margins_name
- return (margins_name, ) + ('', ) * (len(cols) - 1)
+ return (margins_name,) + ("",) * (len(cols) - 1)
if len(rows) > 0:
- margin = data[rows].groupby(rows,
- observed=observed).apply(aggfunc)
+ margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
margin_keys.append(all_key)
else:
- margin = data.groupby(level=0,
- axis=0,
- observed=observed).apply(aggfunc)
+ margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
@@ -354,17 +395,19 @@ def _all_key():
def _convert_by(by):
if by is None:
by = []
- elif (is_scalar(by) or
- isinstance(by, (np.ndarray, Index, ABCSeries, Grouper)) or
- hasattr(by, '__call__')):
+ elif (
+ is_scalar(by)
+ or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
+ or hasattr(by, "__call__")
+ ):
by = [by]
else:
by = list(by)
return by
-@Substitution('\ndata : DataFrame')
-@Appender(_shared_docs['pivot'], indents=1)
+@Substitution("\ndata : DataFrame")
+@Appender(_shared_docs["pivot"], indents=1)
def pivot(data, index=None, columns=None, values=None):
if values is None:
cols = [columns] if index is None else [index, columns]
@@ -379,17 +422,26 @@ def pivot(data, index=None, columns=None, values=None):
if is_list_like(values) and not isinstance(values, tuple):
# Exclude tuple because it is seen as a single column name
- indexed = data._constructor(data[values].values, index=index,
- columns=values)
+ indexed = data._constructor(
+ data[values].values, index=index, columns=values
+ )
else:
- indexed = data._constructor_sliced(data[values].values,
- index=index)
+ indexed = data._constructor_sliced(data[values].values, index=index)
return indexed.unstack(columns)
-def crosstab(index, columns, values=None, rownames=None, colnames=None,
- aggfunc=None, margins=False, margins_name='All', dropna=True,
- normalize=False):
+def crosstab(
+ index,
+ columns,
+ values=None,
+ rownames=None,
+ colnames=None,
+ aggfunc=None,
+ margins=False,
+ margins_name="All",
+ dropna=True,
+ normalize=False,
+):
"""
Compute a simple cross tabulation of two (or more) factors. By default
computes a frequency table of the factors unless an array of values and an
@@ -490,11 +542,10 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
index = com.maybe_make_list(index)
columns = com.maybe_make_list(columns)
- rownames = _get_names(index, rownames, prefix='row')
- colnames = _get_names(columns, colnames, prefix='col')
+ rownames = _get_names(index, rownames, prefix="row")
+ colnames = _get_names(columns, colnames, prefix="col")
- common_idx = _get_objs_combined_axis(index + columns, intersect=True,
- sort=False)
+ common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False)
data = {}
data.update(zip(rownames, index))
@@ -507,30 +558,38 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
raise ValueError("values cannot be used without an aggfunc.")
from pandas import DataFrame
+
df = DataFrame(data, index=common_idx)
if values is None:
- df['__dummy__'] = 0
- kwargs = {'aggfunc': len, 'fill_value': 0}
+ df["__dummy__"] = 0
+ kwargs = {"aggfunc": len, "fill_value": 0}
else:
- df['__dummy__'] = values
- kwargs = {'aggfunc': aggfunc}
-
- table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
- margins=margins, margins_name=margins_name,
- dropna=dropna, **kwargs)
+ df["__dummy__"] = values
+ kwargs = {"aggfunc": aggfunc}
+
+ table = df.pivot_table(
+ "__dummy__",
+ index=rownames,
+ columns=colnames,
+ margins=margins,
+ margins_name=margins_name,
+ dropna=dropna,
+ **kwargs
+ )
# Post-process
if normalize is not False:
- table = _normalize(table, normalize=normalize, margins=margins,
- margins_name=margins_name)
+ table = _normalize(
+ table, normalize=normalize, margins=margins, margins_name=margins_name
+ )
return table
-def _normalize(table, normalize, margins, margins_name='All'):
+def _normalize(table, normalize, margins, margins_name="All"):
if not isinstance(normalize, (bool, str)):
- axis_subs = {0: 'index', 1: 'columns'}
+ axis_subs = {0: "index", 1: "columns"}
try:
normalize = axis_subs[normalize]
except KeyError:
@@ -540,12 +599,12 @@ def _normalize(table, normalize, margins, margins_name='All'):
# Actual Normalizations
normalizers = {
- 'all': lambda x: x / x.sum(axis=1).sum(axis=0),
- 'columns': lambda x: x / x.sum(),
- 'index': lambda x: x.div(x.sum(axis=1), axis=0)
+ "all": lambda x: x / x.sum(axis=1).sum(axis=0),
+ "columns": lambda x: x / x.sum(),
+ "index": lambda x: x.div(x.sum(axis=1), axis=0),
}
- normalizers[True] = normalizers['all']
+ normalizers[True] = normalizers["all"]
try:
f = normalizers[normalize]
@@ -568,12 +627,12 @@ def _normalize(table, normalize, margins, margins_name='All'):
table = _normalize(table, normalize=normalize, margins=False)
# Fix Margins
- if normalize == 'columns':
+ if normalize == "columns":
column_margin = column_margin / column_margin.sum()
table = concat([table, column_margin], axis=1)
table = table.fillna(0)
- elif normalize == 'index':
+ elif normalize == "index":
index_margin = index_margin / index_margin.sum()
table = table.append(index_margin)
table = table.fillna(0)
@@ -599,17 +658,17 @@ def _normalize(table, normalize, margins, margins_name='All'):
return table
-def _get_names(arrs, names, prefix='row'):
+def _get_names(arrs, names, prefix="row"):
if names is None:
names = []
for i, arr in enumerate(arrs):
if isinstance(arr, ABCSeries) and arr.name is not None:
names.append(arr.name)
else:
- names.append('{prefix}_{i}'.format(prefix=prefix, i=i))
+ names.append("{prefix}_{i}".format(prefix=prefix, i=i))
else:
if len(names) != len(arrs):
- raise AssertionError('arrays and names must have the same length')
+ raise AssertionError("arrays and names must have the same length")
if not isinstance(names, list):
names = list(names)
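
Two smaller conventions recur in the pivot.py hunks: black inserts a blank line after an import statement even inside a function body (the lone + lines following the local from pandas import MultiIndex and from pandas import DataFrame), and it applies PEP 8 slice spacing, padding the colon when a bound is a non-trivial expression such as len(index). A short illustrative sketch with made-up names:

    names = ["row_0", "row_1", "col_0", "col_1"]
    index = ["row_0", "row_1"]

    head = names[:2]            # simple bounds: no spaces around the colon
    kept = names[: len(index)]  # complex bound: black spaces the colon like an
                                # operator; the omitted side gets no space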
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index c59f9ffc48055..5d932d7ded9b8 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -9,8 +9,14 @@
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
- ensure_platform_int, is_bool_dtype, is_extension_array_dtype,
- is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion)
+ ensure_platform_int,
+ is_bool_dtype,
+ is_extension_array_dtype,
+ is_integer_dtype,
+ is_list_like,
+ is_object_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.missing import notna
import pandas.core.algorithms as algos
@@ -21,8 +27,11 @@
from pandas.core.internals.arrays import extract_array
from pandas.core.series import Series
from pandas.core.sorting import (
- compress_group_index, decons_obs_group_ids, get_compressed_ids,
- get_group_index)
+ compress_group_index,
+ decons_obs_group_ids,
+ get_compressed_ids,
+ get_group_index,
+)
class _Unstacker:
@@ -76,8 +85,15 @@ class _Unstacker:
unstacked : DataFrame
"""
- def __init__(self, values, index, level=-1, value_columns=None,
- fill_value=None, constructor=None):
+ def __init__(
+ self,
+ values,
+ index,
+ level=-1,
+ value_columns=None,
+ fill_value=None,
+ constructor=None,
+ ):
if values.ndim == 1:
values = values[:, np.newaxis]
@@ -90,7 +106,7 @@ def __init__(self, values, index, level=-1, value_columns=None,
self.constructor = constructor
if value_columns is None and values.shape[1] != 1: # pragma: no cover
- raise ValueError('must pass column labels for multi-column data')
+ raise ValueError("must pass column labels for multi-column data")
self.index = index.remove_unused_levels()
@@ -110,16 +126,16 @@ def __init__(self, values, index, level=-1, value_columns=None,
# If the data frame is too big, the number of unique index combination
# will cause int32 overflow on windows environments.
# We want to check and raise an error before this happens
- num_rows = np.max([index_level.size for index_level
- in self.new_index_levels])
+ num_rows = np.max([index_level.size for index_level in self.new_index_levels])
num_columns = self.removed_level.size
# GH20601: This forces an overflow if the number of cells is too high.
num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
if num_rows > 0 and num_columns > 0 and num_cells <= 0:
- raise ValueError('Unstacked DataFrame is too big, '
- 'causing int32 overflow')
+ raise ValueError(
+ "Unstacked DataFrame is too big, " "causing int32 overflow"
+ )
self._make_sorted_values_labels()
self._make_selectors()
@@ -129,8 +145,8 @@ def _make_sorted_values_labels(self):
codes = list(self.index.codes)
levs = list(self.index.levels)
- to_sort = codes[:v] + codes[v + 1:] + [codes[v]]
- sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]
+ to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
+ sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]
comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
ngroups = len(obs_ids)
@@ -160,8 +176,7 @@ def _make_selectors(self):
mask.put(selector, True)
if mask.sum() < len(self.index):
- raise ValueError('Index contains duplicate entries, '
- 'cannot reshape')
+ raise ValueError("Index contains duplicate entries, " "cannot reshape")
self.group_index = comp_index
self.mask = mask
@@ -188,11 +203,11 @@ def get_new_values(self):
# we can simply reshape if we don't have a mask
if mask_all and len(values):
- new_values = (self.sorted_values
- .reshape(length, width, stride)
- .swapaxes(1, 2)
- .reshape(result_shape)
- )
+ new_values = (
+ self.sorted_values.reshape(length, width, stride)
+ .swapaxes(1, 2)
+ .reshape(result_shape)
+ )
new_mask = np.ones(result_shape, dtype=bool)
return new_values, new_mask
@@ -214,25 +229,27 @@ def get_new_values(self):
# and possibly coerce an input to our output dtype
# e.g. ints -> floats
if needs_i8_conversion(values):
- sorted_values = sorted_values.view('i8')
- new_values = new_values.view('i8')
- name = 'int64'
+ sorted_values = sorted_values.view("i8")
+ new_values = new_values.view("i8")
+ name = "int64"
elif is_bool_dtype(values):
- sorted_values = sorted_values.astype('object')
- new_values = new_values.astype('object')
- name = 'object'
+ sorted_values = sorted_values.astype("object")
+ new_values = new_values.astype("object")
+ name = "object"
else:
sorted_values = sorted_values.astype(name, copy=False)
# fill in our values & mask
f = getattr(_reshape, "unstack_{name}".format(name=name))
- f(sorted_values,
- mask.view('u1'),
- stride,
- length,
- width,
- new_values,
- new_mask.view('u1'))
+ f(
+ sorted_values,
+ mask.view("u1"),
+ stride,
+ length,
+ width,
+ new_values,
+ new_mask.view("u1"),
+ )
# reconstruct dtype if needed
if needs_i8_conversion(values):
@@ -255,8 +272,7 @@ def get_new_columns(self):
new_levels = self.value_columns.levels + (self.removed_level_full,)
new_names = self.value_columns.names + (self.removed_name,)
- new_codes = [lab.take(propagator)
- for lab in self.value_columns.codes]
+ new_codes = [lab.take(propagator) for lab in self.value_columns.codes]
else:
new_levels = [self.value_columns, self.removed_level_full]
new_names = [self.value_columns.name, self.removed_name]
@@ -274,12 +290,12 @@ def get_new_columns(self):
# The entire level is then just a repetition of the single chunk:
new_codes.append(np.tile(repeater, width))
- return MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
+ return MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
def get_new_index(self):
- result_codes = [lab.take(self.compressor)
- for lab in self.sorted_labels[:-1]]
+ result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
# construct the new index
if len(self.new_index_levels) == 1:
@@ -288,8 +304,12 @@ def get_new_index(self):
lev = lev.insert(len(lev), lev._na_value)
return lev.take(lab)
- return MultiIndex(levels=self.new_index_levels, codes=result_codes,
- names=self.new_index_names, verify_integrity=False)
+ return MultiIndex(
+ levels=self.new_index_levels,
+ codes=result_codes,
+ names=self.new_index_names,
+ verify_integrity=False,
+ )
def _unstack_multiple(data, clocs, fill_value=None):
@@ -315,23 +335,24 @@ def _unstack_multiple(data, clocs, fill_value=None):
group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
comp_ids, obs_ids = compress_group_index(group_index, sort=False)
- recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes,
- xnull=False)
+ recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
if rlocs == []:
# Everything is in clocs, so the dummy df has a regular index
- dummy_index = Index(obs_ids, name='__placeholder__')
+ dummy_index = Index(obs_ids, name="__placeholder__")
else:
- dummy_index = MultiIndex(levels=rlevels + [obs_ids],
- codes=rcodes + [comp_ids],
- names=rnames + ['__placeholder__'],
- verify_integrity=False)
+ dummy_index = MultiIndex(
+ levels=rlevels + [obs_ids],
+ codes=rcodes + [comp_ids],
+ names=rnames + ["__placeholder__"],
+ verify_integrity=False,
+ )
if isinstance(data, Series):
dummy = data.copy()
dummy.index = dummy_index
- unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
+ unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
new_levels = clevels
new_names = cnames
new_codes = recons_codes
@@ -348,7 +369,7 @@ def _unstack_multiple(data, clocs, fill_value=None):
dummy = data.copy()
dummy.index = dummy_index
- unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
+ unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
if isinstance(unstacked, Series):
unstcols = unstacked.index
else:
@@ -360,8 +381,9 @@ def _unstack_multiple(data, clocs, fill_value=None):
for rec in recons_codes:
new_codes.append(rec.take(unstcols.codes[-1]))
- new_columns = MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
+ new_columns = MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
if isinstance(unstacked, Series):
unstacked.index = new_columns
@@ -388,24 +410,32 @@ def unstack(obj, level, fill_value=None):
else:
if is_extension_array_dtype(obj.dtype):
return _unstack_extension_series(obj, level, fill_value)
- unstacker = _Unstacker(obj.values, obj.index, level=level,
- fill_value=fill_value,
- constructor=obj._constructor_expanddim)
+ unstacker = _Unstacker(
+ obj.values,
+ obj.index,
+ level=level,
+ fill_value=fill_value,
+ constructor=obj._constructor_expanddim,
+ )
return unstacker.get_result()
def _unstack_frame(obj, level, fill_value=None):
if obj._is_mixed_type:
- unstacker = partial(_Unstacker, index=obj.index,
- level=level, fill_value=fill_value)
- blocks = obj._data.unstack(unstacker,
- fill_value=fill_value)
+ unstacker = partial(
+ _Unstacker, index=obj.index, level=level, fill_value=fill_value
+ )
+ blocks = obj._data.unstack(unstacker, fill_value=fill_value)
return obj._constructor(blocks)
else:
- unstacker = _Unstacker(obj.values, obj.index, level=level,
- value_columns=obj.columns,
- fill_value=fill_value,
- constructor=obj._constructor)
+ unstacker = _Unstacker(
+ obj.values,
+ obj.index,
+ level=level,
+ value_columns=obj.columns,
+ fill_value=fill_value,
+ constructor=obj._constructor,
+ )
return unstacker.get_result()
@@ -441,18 +471,22 @@ def _unstack_extension_series(series, level, fill_value):
dummy_arr = np.arange(len(series))
# fill_value=-1, since we will do a series.values.take later
- result = _Unstacker(dummy_arr, series.index,
- level=level, fill_value=-1).get_result()
+ result = _Unstacker(
+ dummy_arr, series.index, level=level, fill_value=-1
+ ).get_result()
out = []
values = extract_array(series, extract_numpy=False)
for col, indices in result.iteritems():
- out.append(Series(values.take(indices.values,
- allow_fill=True,
- fill_value=fill_value),
- name=col, index=result.index))
- return concat(out, axis='columns', copy=False, keys=result.columns)
+ out.append(
+ Series(
+ values.take(indices.values, allow_fill=True, fill_value=fill_value),
+ name=col,
+ index=result.index,
+ )
+ )
+ return concat(out, axis="columns", copy=False, keys=result.columns)
def stack(frame, level=-1, dropna=True):
@@ -464,6 +498,7 @@ def stack(frame, level=-1, dropna=True):
-------
stacked : Series
"""
+
def factorize(index):
if index.is_unique:
return index, np.arange(len(index))
@@ -487,15 +522,18 @@ def factorize(index):
new_names = list(frame.index.names)
new_names.append(frame.columns.name)
- new_index = MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
+ new_index = MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
else:
- levels, (ilab, clab) = zip(*map(factorize, (frame.index,
- frame.columns)))
+ levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
codes = ilab.repeat(K), np.tile(clab, N).ravel()
- new_index = MultiIndex(levels=levels, codes=codes,
- names=[frame.index.name, frame.columns.name],
- verify_integrity=False)
+ new_index = MultiIndex(
+ levels=levels,
+ codes=codes,
+ names=[frame.index.name, frame.columns.name],
+ verify_integrity=False,
+ )
if frame._is_homogeneous_type:
# For homogeneous EAs, frame.values will coerce to object. So
@@ -505,9 +543,9 @@ def factorize(index):
if is_extension_array_dtype(dtype):
arr = dtype.construct_array_type()
- new_values = arr._concat_same_type([
- col._values for _, col in frame.iteritems()
- ])
+ new_values = arr._concat_same_type(
+ [col._values for _, col in frame.iteritems()]
+ )
new_values = _reorder_for_extension_array_stack(new_values, N, K)
else:
# homogeneous, non-EA
@@ -558,8 +596,10 @@ def stack_multiple(frame, level, dropna=True):
level = updated_level
else:
- raise ValueError("level should contain all level names or all level "
- "numbers, not a mixture of the two.")
+ raise ValueError(
+ "level should contain all level names or all level "
+ "numbers, not a mixture of the two."
+ )
return result
@@ -604,9 +644,16 @@ def _convert_level_number(level_num, columns):
# tuple list excluding level for grouping columns
if len(frame.columns.levels) > 2:
- tuples = list(zip(*[lev.take(level_codes) for lev, level_codes
- in zip(this.columns.levels[:-1],
- this.columns.codes[:-1])]))
+ tuples = list(
+ zip(
+ *[
+ lev.take(level_codes)
+ for lev, level_codes in zip(
+ this.columns.levels[:-1], this.columns.codes[:-1]
+ )
+ ]
+ )
+ )
unique_groups = [key for key, _ in itertools.groupby(tuples)]
new_names = this.columns.names[:-1]
new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
@@ -641,8 +688,9 @@ def _convert_level_number(level_num, columns):
chunk.columns = level_vals.take(chunk.columns.codes[-1])
value_slice = chunk.reindex(columns=level_vals_used).values
else:
- if (frame._is_homogeneous_type and
- is_extension_array_dtype(frame.dtypes.iloc[0])):
+ if frame._is_homogeneous_type and is_extension_array_dtype(
+ frame.dtypes.iloc[0]
+ ):
dtype = this[this.columns[loc]].dtypes.iloc[0]
subset = this[this.columns[loc]]
@@ -682,21 +730,30 @@ def _convert_level_number(level_num, columns):
new_codes.append(np.tile(level_codes, N))
new_names.append(frame.columns.names[level_num])
- new_index = MultiIndex(levels=new_levels, codes=new_codes,
- names=new_names, verify_integrity=False)
+ new_index = MultiIndex(
+ levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
+ )
result = frame._constructor(new_data, index=new_index, columns=new_columns)
# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
if dropna:
- result = result.dropna(axis=0, how='all')
+ result = result.dropna(axis=0, how="all")
return result
-def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
- columns=None, sparse=False, drop_first=False, dtype=None):
+def get_dummies(
+ data,
+ prefix=None,
+ prefix_sep="_",
+ dummy_na=False,
+ columns=None,
+ sparse=False,
+ drop_first=False,
+ dtype=None,
+):
"""
Convert categorical variable into dummy/indicator variables.
@@ -800,29 +857,31 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
from pandas.core.reshape.concat import concat
from itertools import cycle
- dtypes_to_encode = ['object', 'category']
+ dtypes_to_encode = ["object", "category"]
if isinstance(data, DataFrame):
# determine columns being encoded
if columns is None:
- data_to_encode = data.select_dtypes(
- include=dtypes_to_encode)
+ data_to_encode = data.select_dtypes(include=dtypes_to_encode)
else:
data_to_encode = data[columns]
# validate prefixes and separator to avoid silently dropping cols
def check_len(item, name):
- len_msg = ("Length of '{name}' ({len_item}) did not match the "
- "length of the columns being encoded ({len_enc}).")
+ len_msg = (
+ "Length of '{name}' ({len_item}) did not match the "
+ "length of the columns being encoded ({len_enc})."
+ )
if is_list_like(item):
if not len(item) == data_to_encode.shape[1]:
- len_msg = len_msg.format(name=name, len_item=len(item),
- len_enc=data_to_encode.shape[1])
+ len_msg = len_msg.format(
+ name=name, len_item=len(item), len_enc=data_to_encode.shape[1]
+ )
raise ValueError(len_msg)
- check_len(prefix, 'prefix')
- check_len(prefix_sep, 'prefix_sep')
+ check_len(prefix, "prefix")
+ check_len(prefix_sep, "prefix_sep")
if isinstance(prefix, str):
prefix = cycle([prefix])
@@ -850,25 +909,43 @@ def check_len(item, name):
# columns to prepend to result.
with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
- for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,
- prefix_sep):
+ for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix, prefix_sep):
# col is (column_name, column), use just column data here
- dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,
- dummy_na=dummy_na, sparse=sparse,
- drop_first=drop_first, dtype=dtype)
+ dummy = _get_dummies_1d(
+ col[1],
+ prefix=pre,
+ prefix_sep=sep,
+ dummy_na=dummy_na,
+ sparse=sparse,
+ drop_first=drop_first,
+ dtype=dtype,
+ )
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
- result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
- sparse=sparse,
- drop_first=drop_first,
- dtype=dtype)
+ result = _get_dummies_1d(
+ data,
+ prefix,
+ prefix_sep,
+ dummy_na,
+ sparse=sparse,
+ drop_first=drop_first,
+ dtype=dtype,
+ )
return result
-def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
- sparse=False, drop_first=False, dtype=None):
+def _get_dummies_1d(
+ data,
+ prefix,
+ prefix_sep="_",
+ dummy_na=False,
+ sparse=False,
+ drop_first=False,
+ dtype=None,
+):
from pandas.core.reshape.concat import concat
+
# Series avoids inconsistent NaN handling
codes, levels = _factorize_from_iterable(Series(data))
@@ -907,13 +984,10 @@ def get_empty_frame(data):
# PY2 embedded unicode, gh-22084
def _make_col_name(prefix, prefix_sep, level):
- fstr = '{prefix}{prefix_sep}{level}'
- return fstr.format(prefix=prefix,
- prefix_sep=prefix_sep,
- level=level)
+ fstr = "{prefix}{prefix_sep}{level}"
+ return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level)
- dummy_cols = [_make_col_name(prefix, prefix_sep, level)
- for level in levels]
+ dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels]
if isinstance(data, Series):
index = data.index
@@ -945,10 +1019,12 @@ def _make_col_name(prefix, prefix_sep, level):
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
- sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
- sparse_index=IntIndex(N, ixs),
- fill_value=fill_value,
- dtype=dtype)
+ sarr = SparseArray(
+ np.ones(len(ixs), dtype=dtype),
+ sparse_index=IntIndex(N, ixs),
+ fill_value=fill_value,
+ dtype=dtype,
+ )
sparse_series.append(Series(data=sarr, index=index, name=col))
out = concat(sparse_series, axis=1, copy=False)
@@ -968,7 +1044,7 @@ def _make_col_name(prefix, prefix_sep, level):
return DataFrame(dummy_mat, index=index, columns=dummy_cols)
-def make_axis_dummies(frame, axis='minor', transform=None):
+def make_axis_dummies(frame, axis="minor", transform=None):
"""
Construct 1-0 dummy variables corresponding to designated axis
labels
@@ -989,7 +1065,7 @@ def make_axis_dummies(frame, axis='minor', transform=None):
dummies : DataFrame
Column names taken from chosen axis
"""
- numbers = {'major': 0, 'minor': 1}
+ numbers = {"major": 0, "minor": 1}
num = numbers.get(axis, axis)
items = frame.index.levels[num]
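
One convention worth calling out in the reshape.py hunks: black never merges adjacent string literals, so error messages that were split across two quoted pieces keep that implicit concatenation even after the surrounding call is rewrapped. A hedged sketch of the pattern; the condition and message are copied from the overflow hunk above, while the helper name is invented.

    def _check_unstack_size(num_rows, num_columns, num_cells):
        # The two adjacent literals below stay separate pieces; Python joins
        # them into a single message at compile time, and black only rewraps
        # the call around them.
        if num_rows > 0 and num_columns > 0 and num_cells <= 0:
            raise ValueError(
                "Unstacked DataFrame is too big, " "causing int32 overflow"
            )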
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 96124331e43ef..0446f53345671 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -8,20 +8,43 @@
from pandas._libs.lib import infer_dtype
from pandas.core.dtypes.common import (
- _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer,
- is_scalar, is_timedelta64_dtype)
+ _NS_DTYPE,
+ ensure_int64,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_datetime_or_timedelta_dtype,
+ is_integer,
+ is_scalar,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.missing import isna
from pandas import (
- Categorical, Index, Interval, IntervalIndex, Series, Timedelta, Timestamp,
- to_datetime, to_timedelta)
+ Categorical,
+ Index,
+ Interval,
+ IntervalIndex,
+ Series,
+ Timedelta,
+ Timestamp,
+ to_datetime,
+ to_timedelta,
+)
import pandas.core.algorithms as algos
import pandas.core.nanops as nanops
-def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
- include_lowest=False, duplicates='raise'):
+def cut(
+ x,
+ bins,
+ right=True,
+ labels=None,
+ retbins=False,
+ precision=3,
+ include_lowest=False,
+ duplicates="raise",
+):
"""
Bin values into discrete intervals.
@@ -199,18 +222,19 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
sz = x.size
if sz == 0:
- raise ValueError('Cannot cut empty array')
+ raise ValueError("Cannot cut empty array")
rng = (nanops.nanmin(x), nanops.nanmax(x))
mn, mx = [mi + 0.0 for mi in rng]
if np.isinf(mn) or np.isinf(mx):
# GH 24314
- raise ValueError('cannot specify integer `bins` when input data '
- 'contains infinity')
+ raise ValueError(
+ "cannot specify integer `bins` when input data " "contains infinity"
+ )
elif mn == mx: # adjust end points before binning
- mn -= .001 * abs(mn) if mn != 0 else .001
- mx += .001 * abs(mx) if mx != 0 else .001
+ mn -= 0.001 * abs(mn) if mn != 0 else 0.001
+ mx += 0.001 * abs(mx) if mx != 0 else 0.001
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
else: # adjust end points after binning
bins = np.linspace(mn, mx, bins + 1, endpoint=True)
@@ -222,7 +246,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
elif isinstance(bins, IntervalIndex):
if bins.is_overlapping:
- raise ValueError('Overlapping IntervalIndex is not accepted.')
+ raise ValueError("Overlapping IntervalIndex is not accepted.")
else:
if is_datetime64tz_dtype(bins):
@@ -232,20 +256,26 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
bins = _convert_bin_to_numeric_type(bins, dtype)
# GH 26045: cast to float64 to avoid an overflow
- if (np.diff(bins.astype('float64')) < 0).any():
- raise ValueError('bins must increase monotonically.')
-
- fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,
- precision=precision,
- include_lowest=include_lowest,
- dtype=dtype,
- duplicates=duplicates)
-
- return _postprocess_for_cut(fac, bins, retbins, x_is_series,
- series_index, name, dtype)
-
-
-def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
+ if (np.diff(bins.astype("float64")) < 0).any():
+ raise ValueError("bins must increase monotonically.")
+
+ fac, bins = _bins_to_cuts(
+ x,
+ bins,
+ right=right,
+ labels=labels,
+ precision=precision,
+ include_lowest=include_lowest,
+ dtype=dtype,
+ duplicates=duplicates,
+ )
+
+ return _postprocess_for_cut(
+ fac, bins, retbins, x_is_series, series_index, name, dtype
+ )
+
+
+def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"):
"""
Quantile-based discretization function. Discretize variable into
equal-sized buckets based on rank or based on sample quantiles. For example
@@ -309,21 +339,37 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
else:
quantiles = q
bins = algos.quantile(x, quantiles)
- fac, bins = _bins_to_cuts(x, bins, labels=labels,
- precision=precision, include_lowest=True,
- dtype=dtype, duplicates=duplicates)
-
- return _postprocess_for_cut(fac, bins, retbins, x_is_series,
- series_index, name, dtype)
-
-
-def _bins_to_cuts(x, bins, right=True, labels=None,
- precision=3, include_lowest=False,
- dtype=None, duplicates='raise'):
-
- if duplicates not in ['raise', 'drop']:
- raise ValueError("invalid value for 'duplicates' parameter, "
- "valid options are: raise, drop")
+ fac, bins = _bins_to_cuts(
+ x,
+ bins,
+ labels=labels,
+ precision=precision,
+ include_lowest=True,
+ dtype=dtype,
+ duplicates=duplicates,
+ )
+
+ return _postprocess_for_cut(
+ fac, bins, retbins, x_is_series, series_index, name, dtype
+ )
+
+
+def _bins_to_cuts(
+ x,
+ bins,
+ right=True,
+ labels=None,
+ precision=3,
+ include_lowest=False,
+ dtype=None,
+ duplicates="raise",
+):
+
+ if duplicates not in ["raise", "drop"]:
+ raise ValueError(
+ "invalid value for 'duplicates' parameter, "
+ "valid options are: raise, drop"
+ )
if isinstance(bins, IntervalIndex):
# we have a fast-path here
@@ -334,14 +380,16 @@ def _bins_to_cuts(x, bins, right=True, labels=None,
unique_bins = algos.unique(bins)
if len(unique_bins) < len(bins) and len(bins) != 2:
- if duplicates == 'raise':
- raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
- "can drop duplicate edges by setting "
- "the 'duplicates' kwarg".format(bins=bins))
+ if duplicates == "raise":
+ raise ValueError(
+ "Bin edges must be unique: {bins!r}.\nYou "
+ "can drop duplicate edges by setting "
+ "the 'duplicates' kwarg".format(bins=bins)
+ )
else:
bins = unique_bins
- side = 'left' if right else 'right'
+ side = "left" if right else "right"
ids = ensure_int64(bins.searchsorted(x, side=side))
if include_lowest:
@@ -352,13 +400,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None,
if labels is not False:
if labels is None:
- labels = _format_labels(bins, precision, right=right,
- include_lowest=include_lowest,
- dtype=dtype)
+ labels = _format_labels(
+ bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
+ )
else:
if len(labels) != len(bins) - 1:
- raise ValueError('Bin labels must be one fewer than '
- 'the number of bin edges')
+ raise ValueError(
+ "Bin labels must be one fewer than " "the number of bin edges"
+ )
if not is_categorical_dtype(labels):
labels = Categorical(labels, categories=labels, ordered=True)
@@ -386,10 +435,10 @@ def _coerce_to_type(x):
dtype = x.dtype
elif is_datetime64_dtype(x):
x = to_datetime(x)
- dtype = np.dtype('datetime64[ns]')
+ dtype = np.dtype("datetime64[ns]")
elif is_timedelta64_dtype(x):
x = to_timedelta(x)
- dtype = np.dtype('timedelta64[ns]')
+ dtype = np.dtype("timedelta64[ns]")
if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
@@ -414,12 +463,12 @@ def _convert_bin_to_numeric_type(bins, dtype):
"""
bins_dtype = infer_dtype(bins, skipna=False)
if is_timedelta64_dtype(dtype):
- if bins_dtype in ['timedelta', 'timedelta64']:
+ if bins_dtype in ["timedelta", "timedelta64"]:
bins = to_timedelta(bins).view(np.int64)
else:
raise ValueError("bins must be of timedelta64 dtype")
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
- if bins_dtype in ['datetime', 'datetime64']:
+ if bins_dtype in ["datetime", "datetime64"]:
bins = to_datetime(bins).view(np.int64)
else:
raise ValueError("bins must be of datetime64 dtype")
@@ -443,28 +492,26 @@ def _convert_bin_to_datelike_type(bins, dtype):
datelike
"""
if is_datetime64tz_dtype(dtype):
- bins = to_datetime(bins.astype(np.int64),
- utc=True).tz_convert(dtype.tz)
+ bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
elif is_datetime_or_timedelta_dtype(dtype):
bins = Index(bins.astype(np.int64), dtype=dtype)
return bins
-def _format_labels(bins, precision, right=True,
- include_lowest=False, dtype=None):
+def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None):
""" based on the dtype, return our labels """
- closed = 'right' if right else 'left'
+ closed = "right" if right else "left"
if is_datetime64tz_dtype(dtype):
formatter = partial(Timestamp, tz=dtype.tz)
- adjust = lambda x: x - Timedelta('1ns')
+ adjust = lambda x: x - Timedelta("1ns")
elif is_datetime64_dtype(dtype):
formatter = Timestamp
- adjust = lambda x: x - Timedelta('1ns')
+ adjust = lambda x: x - Timedelta("1ns")
elif is_timedelta64_dtype(dtype):
formatter = Timedelta
- adjust = lambda x: x - Timedelta('1ns')
+ adjust = lambda x: x - Timedelta("1ns")
else:
precision = _infer_precision(precision, bins)
formatter = lambda x: _round_frac(x, precision)
@@ -478,7 +525,7 @@ def _format_labels(bins, precision, right=True,
# account that we are all right closed
v = adjust(labels[0].left)
- i = IntervalIndex([Interval(v, labels[0].right, closed='right')])
+ i = IntervalIndex([Interval(v, labels[0].right, closed="right")])
labels = i.append(labels[1:])
return labels
@@ -500,7 +547,7 @@ def _preprocess_for_cut(x):
# Check that the passed array is a Pandas or Numpy object
# We don't want to strip away a Pandas data-type here (e.g. datetimetz)
- ndim = getattr(x, 'ndim', None)
+ ndim = getattr(x, "ndim", None)
if ndim is None:
x = np.asarray(x)
if x.ndim != 1:
@@ -509,8 +556,7 @@ def _preprocess_for_cut(x):
return x_is_series, series_index, name, x
-def _postprocess_for_cut(fac, bins, retbins, x_is_series,
- series_index, name, dtype):
+def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name, dtype):
"""
handles post processing for the cut method where
we combine the index information if the originally passed
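
The tile.py hunks above also show black's numeric literal normalization: a float written without a leading zero gains one, so .001 becomes 0.001, alongside the usual quote changes. A minimal sketch of that endpoint adjustment; the values come from the cut hunk, the variable names are only illustrative.

    mn, mx = 5.0, 5.0

    # Pre-black these lines used bare ".001" literals:
    #   mn -= .001 * abs(mn) if mn != 0 else .001
    # black writes such floats with an explicit leading zero.
    mn -= 0.001 * abs(mn) if mn != 0 else 0.001
    mx += 0.001 * abs(mx) if mx != 0 else 0.001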
diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py
index 9975fe65ac0fe..044e058904dce 100644
--- a/pandas/core/reshape/util.py
+++ b/pandas/core/reshape/util.py
@@ -51,6 +51,9 @@ def cartesian_product(X):
# if any factor is empty, the cartesian product is empty
b = np.zeros_like(cumprodX)
- return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]),
- np.product(a[i]))
- for i, x in enumerate(X)]
+ return [
+ np.tile(
+ np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i])
+ )
+ for i, x in enumerate(X)
+ ]
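
The cartesian_product rewrite above shows how black handles a comprehension that exceeds the line limit: the brackets are opened up, the expression sits on its own indented line, and the for clause follows on the next line with no trailing comma. A small self-contained sketch under the same rule; the variable names are made up and only the structure mirrors the hunk.

    import numpy as np

    value_blocks = [[1, 2], [3, 4, 5]]
    repeats = [2, 3]
    tile_counts = [3, 2]

    # Too long for one line, so black splits the comprehension at the "for"
    # clause inside exploded brackets.
    tiled_blocks = [
        np.tile(np.repeat(np.asarray(block), repeats[i]), tile_counts[i])
        for i, block in enumerate(value_blocks)
    ]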
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 13966d4551b54..b3a7f38aef8ef 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -19,15 +19,38 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.core.dtypes.common import (
- _is_unorderable_exception, ensure_platform_int, is_bool, is_categorical,
- is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
- is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
- is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
+ _is_unorderable_exception,
+ ensure_platform_int,
+ is_bool,
+ is_categorical,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetimelike,
+ is_dict_like,
+ is_extension_array_dtype,
+ is_extension_type,
+ is_hashable,
+ is_integer,
+ is_iterator,
+ is_list_like,
+ is_scalar,
+ is_string_like,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, ABCSeries,
- ABCSparseArray, ABCSparseSeries)
+ ABCDataFrame,
+ ABCDatetimeArray,
+ ABCDatetimeIndex,
+ ABCSeries,
+ ABCSparseArray,
+ ABCSparseSeries,
+)
from pandas.core.dtypes.missing import (
- isna, na_value_for_dtype, notna, remove_na_arraylike)
+ isna,
+ na_value_for_dtype,
+ notna,
+ remove_na_arraylike,
+)
import pandas as pd
from pandas.core import algorithms, base, generic, nanops, ops
@@ -37,7 +60,12 @@
from pandas.core.arrays.sparse import SparseAccessor
import pandas.core.common as com
from pandas.core.index import (
- Float64Index, Index, InvalidIndexError, MultiIndex, ensure_index)
+ Float64Index,
+ Index,
+ InvalidIndexError,
+ MultiIndex,
+ ensure_index,
+)
from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
import pandas.core.indexes.base as ibase
from pandas.core.indexes.datetimes import DatetimeIndex
@@ -52,17 +80,24 @@
import pandas.io.formats.format as fmt
import pandas.plotting
-__all__ = ['Series']
+__all__ = ["Series"]
_shared_doc_kwargs = dict(
- axes='index', klass='Series', axes_single_arg="{0 or 'index'}",
+ axes="index",
+ klass="Series",
+ axes_single_arg="{0 or 'index'}",
axis="""axis : {0 or 'index'}
Parameter needed for compatibility with DataFrame.""",
inplace="""inplace : boolean, default False
If True, performs operation inplace and returns None.""",
- unique='np.ndarray', duplicated='Series',
- optional_by='', optional_mapper='', optional_labels='', optional_axis='',
- versionadded_to_excel='\n .. versionadded:: 0.20.0\n')
+ unique="np.ndarray",
+ duplicated="Series",
+ optional_by="",
+ optional_mapper="",
+ optional_labels="",
+ optional_axis="",
+ versionadded_to_excel="\n .. versionadded:: 0.20.0\n",
+)
# see gh-16971
@@ -74,8 +109,11 @@ def remove_na(arr):
Use s[s.notnull()] instead.
"""
- warnings.warn("remove_na is deprecated and is a private "
- "function. Do not use.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "remove_na is deprecated and is a private " "function. Do not use.",
+ FutureWarning,
+ stacklevel=2,
+ )
return remove_na_arraylike(arr)
@@ -87,12 +125,12 @@ def _coerce_method(converter):
def wrapper(self):
if len(self) == 1:
return converter(self.iloc[0])
- raise TypeError("cannot convert the series to "
- "{0}".format(str(converter)))
+ raise TypeError("cannot convert the series to " "{0}".format(str(converter)))
wrapper.__name__ = "__{name}__".format(name=converter.__name__)
return wrapper
+
# ----------------------------------------------------------------------
# Series class
@@ -133,23 +171,26 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
copy : bool, default False
Copy input data.
"""
- _metadata = ['name']
- _accessors = {'dt', 'cat', 'str', 'sparse'}
+
+ _metadata = ["name"]
+ _accessors = {"dt", "cat", "str", "sparse"}
# tolist is not actually deprecated, just suppressed in the __dir__
_deprecations = generic.NDFrame._deprecations | frozenset(
- ['asobject', 'reshape', 'get_value', 'set_value',
- 'valid', 'tolist'])
+ ["asobject", "reshape", "get_value", "set_value", "valid", "tolist"]
+ )
# Override cache_readonly bc Series is mutable
- hasnans = property(base.IndexOpsMixin.hasnans.func,
- doc=base.IndexOpsMixin.hasnans.__doc__)
+ hasnans = property(
+ base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__
+ )
_data = None # type: SingleBlockManager
# ----------------------------------------------------------------------
# Constructors
- def __init__(self, data=None, index=None, dtype=None, name=None,
- copy=False, fastpath=False):
+ def __init__(
+ self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False
+ ):
# we are called internally, so short-circuit
if fastpath:
@@ -172,15 +213,19 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
if dtype is not None:
# GH 26336: explicitly handle 'category' to avoid warning
# TODO: Remove after CategoricalDtype defaults to ordered=False
- if (isinstance(dtype, str) and dtype == 'category' and
- is_categorical(data)):
+ if (
+ isinstance(dtype, str)
+ and dtype == "category"
+ and is_categorical(data)
+ ):
dtype = data.dtype
dtype = self._validate_dtype(dtype)
if isinstance(data, MultiIndex):
- raise NotImplementedError("initializing a Series from a "
- "MultiIndex is not supported")
+ raise NotImplementedError(
+ "initializing a Series from a " "MultiIndex is not supported"
+ )
elif isinstance(data, Index):
if name is None:
name = data.name
@@ -191,8 +236,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
else:
# need to copy to avoid aliasing issues
data = data._values.copy()
- if (isinstance(data, ABCDatetimeIndex) and
- data.tz is not None):
+ if isinstance(data, ABCDatetimeIndex) and data.tz is not None:
# GH#24096 need copy to be deep for datetime64tz case
# TODO: See if we can avoid these copies
data = data._values.copy(deep=True)
@@ -218,16 +262,19 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
elif not data.index.equals(index) or copy:
# GH#19275 SingleBlockManager input should only be called
# internally
- raise AssertionError('Cannot pass both SingleBlockManager '
- '`data` argument and a different '
- '`index` argument. `copy` must '
- 'be False.')
+ raise AssertionError(
+ "Cannot pass both SingleBlockManager "
+ "`data` argument and a different "
+ "`index` argument. `copy` must "
+ "be False."
+ )
elif is_extension_array_dtype(data):
pass
elif isinstance(data, (set, frozenset)):
- raise TypeError("{0!r} type is unordered"
- "".format(data.__class__.__name__))
+ raise TypeError(
+ "{0!r} type is unordered" "".format(data.__class__.__name__)
+ )
elif isinstance(data, ABCSparseArray):
# handle sparse passed here (and force conversion)
data = data.to_dense()
@@ -245,22 +292,20 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
try:
if len(index) != len(data):
raise ValueError(
- 'Length of passed values is {val}, '
- 'index implies {ind}'
- .format(val=len(data), ind=len(index)))
+ "Length of passed values is {val}, "
+ "index implies {ind}".format(val=len(data), ind=len(index))
+ )
except TypeError:
pass
# create/copy the manager
if isinstance(data, SingleBlockManager):
if dtype is not None:
- data = data.astype(dtype=dtype, errors='ignore',
- copy=copy)
+ data = data.astype(dtype=dtype, errors="ignore", copy=copy)
elif copy:
data = data.copy()
else:
- data = sanitize_array(data, index, dtype, copy,
- raise_cast_failure=True)
+ data = sanitize_array(data, index, dtype, copy, raise_cast_failure=True)
data = SingleBlockManager(data, index, fastpath=True)
@@ -317,8 +362,9 @@ def _init_dict(self, data, index=None, dtype=None):
return s._data, s.index
@classmethod
- def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
- fastpath=False):
+ def from_array(
+ cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False
+ ):
"""
Construct Series from array.
@@ -330,14 +376,20 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
Series
Constructed Series.
"""
- warnings.warn("'from_array' is deprecated and will be removed in a "
- "future version. Please use the pd.Series(..) "
- "constructor instead.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'from_array' is deprecated and will be removed in a "
+ "future version. Please use the pd.Series(..) "
+ "constructor instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
if isinstance(arr, ABCSparseArray):
from pandas.core.sparse.series import SparseSeries
+
cls = SparseSeries
- return cls(arr, index=index, name=name, dtype=dtype,
- copy=copy, fastpath=fastpath)
+ return cls(
+ arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath
+ )
# ----------------------------------------------------------------------
@@ -348,6 +400,7 @@ def _constructor(self):
@property
def _constructor_expanddim(self):
from pandas.core.frame import DataFrame
+
return DataFrame
# types
@@ -367,8 +420,7 @@ def _set_axis(self, axis, labels, fastpath=False):
is_all_dates = labels.is_all_dates
if is_all_dates:
- if not isinstance(labels,
- (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+ if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
try:
labels = DatetimeIndex(labels)
# need to set here because we changed the index
@@ -381,15 +433,15 @@ def _set_axis(self, axis, labels, fastpath=False):
self._set_subtyp(is_all_dates)
- object.__setattr__(self, '_index', labels)
+ object.__setattr__(self, "_index", labels)
if not fastpath:
self._data.set_axis(axis, labels)
def _set_subtyp(self, is_all_dates):
if is_all_dates:
- object.__setattr__(self, '_subtyp', 'time_series')
+ object.__setattr__(self, "_subtyp", "time_series")
else:
- object.__setattr__(self, '_subtyp', 'series')
+ object.__setattr__(self, "_subtyp", "series")
def _update_inplace(self, result, **kwargs):
# we want to call the generic version and not the IndexOpsMixin
@@ -405,8 +457,8 @@ def name(self):
@name.setter
def name(self, value):
if value is not None and not is_hashable(value):
- raise TypeError('Series.name must be a hashable type')
- object.__setattr__(self, '_name', value)
+ raise TypeError("Series.name must be a hashable type")
+ object.__setattr__(self, "_name", value)
# ndarray compatibility
@property
@@ -431,10 +483,13 @@ def ftype(self):
.. deprecated:: 0.25.0
Use :func:`dtype` instead.
"""
- warnings.warn("Series.ftype is deprecated and will "
- "be removed in a future version. "
- "Use Series.dtype instead.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Series.ftype is deprecated and will "
+ "be removed in a future version. "
+ "Use Series.dtype instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._data.ftype
@@ -446,10 +501,13 @@ def ftypes(self):
.. deprecated:: 0.25.0
Use :func:`dtypes` instead.
"""
- warnings.warn("Series.ftypes is deprecated and will "
- "be removed in a future version. "
- "Use Series.dtype instead.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Series.ftypes is deprecated and will "
+ "be removed in a future version. "
+ "Use Series.dtype instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._data.ftype
@@ -524,7 +582,9 @@ def get_values(self):
warnings.warn(
"The 'get_values' method is deprecated and will be removed in a "
"future version. Use '.to_numpy()' or '.array' instead.",
- FutureWarning, stacklevel=2)
+ FutureWarning,
+ stacklevel=2,
+ )
return self._internal_get_values()
def _internal_get_values(self):
@@ -541,12 +601,15 @@ def asobject(self):
*this is an internal non-public method*
"""
- warnings.warn("'asobject' is deprecated. Use 'astype(object)'"
- " instead", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "'asobject' is deprecated. Use 'astype(object)'" " instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self.astype(object).values
# ops
- def ravel(self, order='C'):
+ def ravel(self, order="C"):
"""
Return the flattened underlying data as an ndarray.
@@ -576,9 +639,11 @@ def compress(self, condition, *args, **kwargs):
--------
numpy.ndarray.compress
"""
- msg = ("Series.compress(condition) is deprecated. "
- "Use 'Series[condition]' or "
- "'np.asarray(series).compress(condition)' instead.")
+ msg = (
+ "Series.compress(condition) is deprecated. "
+ "Use 'Series[condition]' or "
+ "'np.asarray(series).compress(condition)' instead."
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
nv.validate_compress(args, kwargs)
return self[condition]
@@ -624,9 +689,11 @@ def nonzero(self):
d 4
dtype: int64
"""
- msg = ("Series.nonzero() is deprecated "
- "and will be removed in a future version."
- "Use Series.to_numpy().nonzero() instead")
+ msg = (
+ "Series.nonzero() is deprecated "
+            "and will be removed in a future version. "
+ "Use Series.to_numpy().nonzero() instead"
+ )
warnings.warn(msg, FutureWarning, stacklevel=2)
return self._values.nonzero()
@@ -640,8 +707,11 @@ def put(self, *args, **kwargs):
--------
numpy.ndarray.put
"""
- warnings.warn('`put` has been deprecated and will be removed in a'
- 'future version.', FutureWarning, stacklevel=2)
+ warnings.warn(
+            "`put` has been deprecated and will be removed in a " "future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
self._values.put(*args, **kwargs)
def __len__(self):
@@ -716,27 +786,26 @@ def view(self, dtype=None):
4 2
dtype: int8
"""
- return self._constructor(self._values.view(dtype),
- index=self.index).__finalize__(self)
+ return self._constructor(
+ self._values.view(dtype), index=self.index
+ ).__finalize__(self)
# ----------------------------------------------------------------------
# NDArray Compat
_HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
def __array_ufunc__(
- self,
- ufunc: Callable,
- method: str,
- *inputs: Any,
- **kwargs: Any
+ self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
):
# TODO: handle DataFrame
from pandas.core.internals.construction import extract_array
+
cls = type(self)
# for binary ops, use our custom dunder methods
result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs)
+ self, ufunc, method, *inputs, **kwargs
+ )
if result is not NotImplemented:
return result
@@ -745,19 +814,19 @@ def __array_ufunc__(
for item in inputs:
higher_priority = (
- hasattr(item, '__array_priority__') and
- item.__array_priority__ > self.__array_priority__
+ hasattr(item, "__array_priority__")
+ and item.__array_priority__ > self.__array_priority__
)
has_array_ufunc = (
- hasattr(item, '__array_ufunc__') and
- type(item).__array_ufunc__ not in no_defer and
- not isinstance(item, self._HANDLED_TYPES)
+ hasattr(item, "__array_ufunc__")
+ and type(item).__array_ufunc__ not in no_defer
+ and not isinstance(item, self._HANDLED_TYPES)
)
if higher_priority or has_array_ufunc:
return NotImplemented
# align all the inputs.
- names = [getattr(x, 'name') for x in inputs if hasattr(x, 'name')]
+ names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
types = tuple(type(x) for x in inputs)
# TODO: dataframe
alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)]
@@ -770,8 +839,10 @@ def __array_ufunc__(
index = alignable[0].index
for s in alignable[1:]:
index |= s.index
- inputs = tuple(x.reindex(index) if issubclass(t, Series) else x
- for x, t in zip(inputs, types))
+ inputs = tuple(
+ x.reindex(index) if issubclass(t, Series) else x
+ for x, t in zip(inputs, types)
+ )
else:
index = self.index
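For context (outside the patch itself): the reindexing above is what makes a binary ufunc align two Series on the union of their indexes, which is visible from the outside:

import numpy as np
import pandas as pd

a = pd.Series([1, 2], index=["x", "y"])
b = pd.Series([10, 20], index=["y", "z"])
print(np.add(a, b))
# x     NaN
# y    12.0
# z     NaN
# dtype: float64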
@@ -788,7 +859,7 @@ def construct_return(result):
return result
elif result.ndim > 1:
# e.g. np.subtract.outer
- if method == 'outer':
+ if method == "outer":
msg = (
"outer method for ufunc {} is not implemented on "
"pandas objects. Returning an ndarray, but in the "
@@ -796,18 +867,14 @@ def construct_return(result):
"Consider explicitly converting the Series "
"to an array with '.array' first."
)
- warnings.warn(msg.format(ufunc), FutureWarning,
- stacklevel=3)
+ warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=3)
return result
- return self._constructor(result,
- index=index,
- name=name,
- copy=False)
+ return self._constructor(result, index=index, name=name, copy=False)
if type(result) is tuple:
# multiple return values
return tuple(construct_return(x) for x in result)
- elif method == 'at':
+ elif method == "at":
# no return value
return None
else:
@@ -860,8 +927,11 @@ def __array__(self, dtype=None):
array(['1999-12-31T23:00:00.000000000', ...],
dtype='datetime64[ns]')
"""
- if (dtype is None and isinstance(self.array, ABCDatetimeArray)
- and getattr(self.dtype, 'tz', None)):
+ if (
+ dtype is None
+ and isinstance(self.array, ABCDatetimeArray)
+ and getattr(self.dtype, "tz", None)
+ ):
msg = (
"Converting timezone-aware DatetimeArray to timezone-naive "
"ndarray with 'datetime64[ns]' dtype. In the future, this "
@@ -871,7 +941,7 @@ def __array__(self, dtype=None):
"To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
)
warnings.warn(msg, FutureWarning, stacklevel=3)
- dtype = 'M8[ns]'
+ dtype = "M8[ns]"
return np.asarray(self.array, dtype)
# ----------------------------------------------------------------------
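For context (outside the patch itself): the branch above handles conversion of a tz-aware Series to an ndarray; in 0.25 it warns and, for now, still returns timezone-naive datetime64[ns] values:

import numpy as np
import pandas as pd

s = pd.Series(pd.date_range("2019-01-01", periods=2, tz="UTC"))
arr = np.asarray(s)      # FutureWarning, as described in the message above
print(arr.dtype)         # datetime64[ns]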
@@ -884,8 +954,11 @@ def real(self):
.. deprecated 0.25.0
"""
- warnings.warn("`real` has be deprecated and will be removed in a "
- "future verison", FutureWarning, stacklevel=2)
+ warnings.warn(
+            "`real` has been deprecated and will be removed in a " "future version",
+ FutureWarning,
+ stacklevel=2,
+ )
return self.values.real
@real.setter
@@ -899,8 +972,11 @@ def imag(self):
.. deprecated 0.25.0
"""
- warnings.warn("`imag` has be deprecated and will be removed in a "
- "future verison", FutureWarning, stacklevel=2)
+ warnings.warn(
+            "`imag` has been deprecated and will be removed in a " "future version",
+ FutureWarning,
+ stacklevel=2,
+ )
return self.values.imag
@imag.setter
@@ -916,8 +992,8 @@ def imag(self, v):
def _unpickle_series_compat(self, state):
if isinstance(state, dict):
- self._data = state['_data']
- self.name = state['name']
+ self._data = state["_data"]
+ self.name = state["name"]
self.index = self._data.index
elif isinstance(state, tuple):
@@ -975,7 +1051,7 @@ def _ixs(self, i, axis=0):
raise
except Exception:
if isinstance(i, slice):
- indexer = self.index._convert_slice_indexer(i, kind='iloc')
+ indexer = self.index._convert_slice_indexer(i, kind="iloc")
return self._get_values(indexer)
else:
label = self.index[i]
@@ -989,8 +1065,7 @@ def _is_mixed_type(self):
return False
def _slice(self, slobj, axis=0, kind=None):
- slobj = self.index._convert_slice_indexer(slobj,
- kind=kind or 'getitem')
+ slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem")
return self._get_values(slobj)
def __getitem__(self, key):
@@ -1006,8 +1081,8 @@ def __getitem__(self, key):
try:
if not is_scalar(self.index.get_loc(key)):
result = self._constructor(
- result, index=[key] * len(result),
- dtype=self.dtype).__finalize__(self)
+ result, index=[key] * len(result), dtype=self.dtype
+ ).__finalize__(self)
except KeyError:
pass
return result
@@ -1024,8 +1099,7 @@ def __getitem__(self, key):
else:
# we can try to coerce the indexer (or this will raise)
- new_key = self.index._convert_scalar_indexer(key,
- kind='getitem')
+ new_key = self.index._convert_scalar_indexer(key, kind="getitem")
if type(new_key) != type(key):
return self.__getitem__(new_key)
raise
@@ -1044,11 +1118,13 @@ def __getitem__(self, key):
def _get_with(self, key):
# other: fancy integer or otherwise
if isinstance(key, slice):
- indexer = self.index._convert_slice_indexer(key, kind='getitem')
+ indexer = self.index._convert_slice_indexer(key, kind="getitem")
return self._get_values(indexer)
elif isinstance(key, ABCDataFrame):
- raise TypeError('Indexing a Series with DataFrame is not '
- 'supported, use the appropriate DataFrame column')
+ raise TypeError(
+ "Indexing a Series with DataFrame is not "
+ "supported, use the appropriate DataFrame column"
+ )
elif isinstance(key, tuple):
try:
return self._get_values_tuple(key)
@@ -1068,12 +1144,12 @@ def _get_with(self, key):
else:
key_type = lib.infer_dtype(key, skipna=False)
- if key_type == 'integer':
+ if key_type == "integer":
if self.index.is_integer() or self.index.is_floating():
return self.loc[key]
else:
return self._get_values(key)
- elif key_type == 'boolean':
+ elif key_type == "boolean":
return self._get_values(key)
try:
@@ -1096,17 +1172,19 @@ def _get_values_tuple(self, key):
return self._get_values(key)
if not isinstance(self.index, MultiIndex):
- raise ValueError('Can only tuple-index with a MultiIndex')
+ raise ValueError("Can only tuple-index with a MultiIndex")
# If key is contained, would have returned by now
indexer, new_index = self.index.get_loc_level(key)
- return self._constructor(self._values[indexer],
- index=new_index).__finalize__(self)
+ return self._constructor(self._values[indexer], index=new_index).__finalize__(
+ self
+ )
def _get_values(self, indexer):
try:
- return self._constructor(self._data.get_slice(indexer),
- fastpath=True).__finalize__(self)
+ return self._constructor(
+ self._data.get_slice(indexer), fastpath=True
+ ).__finalize__(self)
except Exception:
return self._values[indexer]
@@ -1121,8 +1199,7 @@ def setitem(key, value):
raise
except (KeyError, ValueError):
values = self._values
- if (is_integer(key) and
- not self.index.inferred_type == 'integer'):
+ if is_integer(key) and not self.index.inferred_type == "integer":
values[key] = value
return
@@ -1137,8 +1214,7 @@ def setitem(key, value):
value = iNaT
try:
- self.index._engine.set_value(self._values, key,
- value)
+ self.index._engine.set_value(self._values, key, value)
return
except TypeError:
pass
@@ -1147,8 +1223,7 @@ def setitem(key, value):
return
except TypeError as e:
- if (isinstance(key, tuple) and
- not isinstance(self.index, MultiIndex)):
+ if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
raise ValueError("Can only tuple-index with a MultiIndex")
# python 3 type errors should be raised
@@ -1183,7 +1258,7 @@ def _set_with_engine(self, key, value):
def _set_with(self, key, value):
# other: fancy integer or otherwise
if isinstance(key, slice):
- indexer = self.index._convert_slice_indexer(key, kind='getitem')
+ indexer = self.index._convert_slice_indexer(key, kind="getitem")
return self._set_values(indexer, value)
else:
if isinstance(key, tuple):
@@ -1205,12 +1280,12 @@ def _set_with(self, key, value):
else:
key_type = lib.infer_dtype(key, skipna=False)
- if key_type == 'integer':
- if self.index.inferred_type == 'integer':
+ if key_type == "integer":
+ if self.index.inferred_type == "integer":
self._set_labels(key, value)
else:
return self._set_values(key, value)
- elif key_type == 'boolean':
+ elif key_type == "boolean":
self._set_values(key.astype(np.bool_), value)
else:
self._set_labels(key, value)
@@ -1223,7 +1298,7 @@ def _set_labels(self, key, value):
indexer = self.index.get_indexer(key)
mask = indexer == -1
if mask.any():
- raise ValueError('%s not contained in the index' % str(key[mask]))
+ raise ValueError("%s not contained in the index" % str(key[mask]))
self._set_values(indexer, value)
def _set_values(self, key, value):
@@ -1287,8 +1362,7 @@ def repeat(self, repeats, axis=None):
nv.validate_repeat(tuple(), dict(axis=axis))
new_index = self.index.repeat(repeats)
new_values = self._values.repeat(repeats)
- return self._constructor(new_values,
- index=new_index).__finalize__(self)
+ return self._constructor(new_values, index=new_index).__finalize__(self)
def get_value(self, label, takeable=False):
"""
@@ -1306,16 +1380,20 @@ def get_value(self, label, takeable=False):
-------
scalar value
"""
- warnings.warn("get_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._get_value(label, takeable=takeable)
def _get_value(self, label, takeable=False):
if takeable is True:
return com.maybe_box_datetimelike(self._values[label])
return self.index.get_value(self._values, label)
+
_get_value.__doc__ = get_value.__doc__
def set_value(self, label, value, takeable=False):
@@ -1342,10 +1420,13 @@ def set_value(self, label, value, takeable=False):
If label is contained, will be reference to calling Series,
otherwise a new object.
"""
- warnings.warn("set_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._set_value(label, value, takeable=takeable)
def _set_value(self, label, value, takeable=False):
@@ -1360,6 +1441,7 @@ def _set_value(self, label, value, takeable=False):
self.loc[label] = value
return self
+
_set_value.__doc__ = set_value.__doc__
def reset_index(self, level=None, drop=False, name=None, inplace=False):
@@ -1470,7 +1552,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False):
2 baz one 2
3 baz two 3
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if drop:
new_index = ibase.default_index(len(self))
if level is not None:
@@ -1485,11 +1567,13 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False):
# set name if it was passed, otherwise, keep the previous name
self.name = name or self.name
else:
- return self._constructor(self._values.copy(),
- index=new_index).__finalize__(self)
+ return self._constructor(
+ self._values.copy(), index=new_index
+ ).__finalize__(self)
elif inplace:
- raise TypeError('Cannot reset_index inplace on a Series '
- 'to create a DataFrame')
+ raise TypeError(
+ "Cannot reset_index inplace on a Series " "to create a DataFrame"
+ )
else:
df = self.to_frame(name)
return df.reset_index(level=level, drop=drop)
@@ -1503,22 +1587,43 @@ def __repr__(self):
"""
buf = StringIO("")
width, height = get_terminal_size()
- max_rows = (height if get_option("display.max_rows") == 0 else
- get_option("display.max_rows"))
- min_rows = (height if get_option("display.max_rows") == 0 else
- get_option("display.min_rows"))
+ max_rows = (
+ height
+ if get_option("display.max_rows") == 0
+ else get_option("display.max_rows")
+ )
+ min_rows = (
+ height
+ if get_option("display.max_rows") == 0
+ else get_option("display.min_rows")
+ )
show_dimensions = get_option("display.show_dimensions")
- self.to_string(buf=buf, name=self.name, dtype=self.dtype,
- min_rows=min_rows, max_rows=max_rows,
- length=show_dimensions)
+ self.to_string(
+ buf=buf,
+ name=self.name,
+ dtype=self.dtype,
+ min_rows=min_rows,
+ max_rows=max_rows,
+ length=show_dimensions,
+ )
result = buf.getvalue()
return result
- def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,
- index=True, length=False, dtype=False, name=False,
- max_rows=None, min_rows=None):
+ def to_string(
+ self,
+ buf=None,
+ na_rep="NaN",
+ float_format=None,
+ header=True,
+ index=True,
+ length=False,
+ dtype=False,
+ name=False,
+ max_rows=None,
+ min_rows=None,
+ ):
"""
Render a string representation of the Series.
@@ -1554,19 +1659,27 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,
String representation of Series if ``buf=None``, otherwise None.
"""
- formatter = fmt.SeriesFormatter(self, name=name, length=length,
- header=header, index=index,
- dtype=dtype, na_rep=na_rep,
- float_format=float_format,
- min_rows=min_rows,
- max_rows=max_rows)
+ formatter = fmt.SeriesFormatter(
+ self,
+ name=name,
+ length=length,
+ header=header,
+ index=index,
+ dtype=dtype,
+ na_rep=na_rep,
+ float_format=float_format,
+ min_rows=min_rows,
+ max_rows=max_rows,
+ )
result = formatter.to_string()
# catch contract violations
if not isinstance(result, str):
- raise AssertionError("result must be of type unicode, type"
- " of result is {0!r}"
- "".format(result.__class__.__name__))
+ raise AssertionError(
+ "result must be of type unicode, type"
+ " of result is {0!r}"
+ "".format(result.__class__.__name__)
+ )
if buf is None:
return result
@@ -1574,7 +1687,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,
try:
buf.write(result)
except AttributeError:
- with open(buf, 'w') as f:
+ with open(buf, "w") as f:
f.write(result)
# ----------------------------------------------------------------------
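For context (outside the patch itself): to_string returns the rendered text only when buf is None; otherwise it writes into the buffer, falling back to treating buf as a path (the open(buf, "w") branch above):

import io
import pandas as pd

s = pd.Series([1, 2], name="n")
print(isinstance(s.to_string(), str))    # True: no buffer, so the text is returned
buf = io.StringIO()
s.to_string(buf=buf)                     # buffer given, nothing is returned
print(buf.getvalue().splitlines()[0])    # first rendered row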
@@ -1691,7 +1804,7 @@ def to_frame(self, name=None):
return df
- def to_sparse(self, kind='block', fill_value=None):
+ def to_sparse(self, kind="block", fill_value=None):
"""
Convert Series to SparseSeries.
@@ -1709,16 +1822,19 @@ def to_sparse(self, kind='block', fill_value=None):
Sparse representation of the Series.
"""
- warnings.warn("Series.to_sparse is deprecated and will be removed "
- "in a future version", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Series.to_sparse is deprecated and will be removed " "in a future version",
+ FutureWarning,
+ stacklevel=2,
+ )
from pandas.core.sparse.series import SparseSeries
values = SparseArray(self, kind=kind, fill_value=fill_value)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="SparseSeries")
- return SparseSeries(
- values, index=self.index, name=self.name
- ).__finalize__(self)
+ return SparseSeries(values, index=self.index, name=self.name).__finalize__(
+ self
+ )
def _set_name(self, name, inplace=False):
"""
@@ -1730,7 +1846,7 @@ def _set_name(self, name, inplace=False):
inplace : bool
whether to modify `self` directly or return a copy
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
ser = self if inplace else self.copy()
ser.name = name
return ser
@@ -1777,8 +1893,7 @@ def count(self, level=None):
obs = level_codes[notna(self.values)]
out = np.bincount(obs, minlength=len(lev) or None)
- return self._constructor(out, index=lev,
- dtype='int64').__finalize__(self)
+ return self._constructor(out, index=lev, dtype="int64").__finalize__(self)
def mode(self, dropna=True):
"""
@@ -1865,7 +1980,7 @@ def unique(self):
result = super().unique()
return result
- def drop_duplicates(self, keep='first', inplace=False):
+ def drop_duplicates(self, keep="first", inplace=False):
"""
Return Series with duplicate values removed.
@@ -1939,7 +2054,7 @@ def drop_duplicates(self, keep='first', inplace=False):
"""
return super().drop_duplicates(keep=keep, inplace=inplace)
- def duplicated(self, keep='first'):
+ def duplicated(self, keep="first"):
"""
Indicate duplicate Series values.
@@ -2158,24 +2273,32 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs):
# ndarray compat
argmin = deprecate(
- 'argmin', idxmin, '0.21.0',
- msg=dedent("""
+ "argmin",
+ idxmin,
+ "0.21.0",
+ msg=dedent(
+ """
The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
- row.""")
+ row."""
+ ),
)
argmax = deprecate(
- 'argmax', idxmax, '0.21.0',
- msg=dedent("""
+ "argmax",
+ idxmax,
+ "0.21.0",
+ msg=dedent(
+ """
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
- row.""")
+ row."""
+ ),
)
def round(self, decimals=0, *args, **kwargs):
@@ -2214,7 +2337,7 @@ def round(self, decimals=0, *args, **kwargs):
return result
- def quantile(self, q=0.5, interpolation='linear'):
+ def quantile(self, q=0.5, interpolation="linear"):
"""
Return value at the given quantile.
@@ -2265,21 +2388,18 @@ def quantile(self, q=0.5, interpolation='linear'):
# about 2D cases.
df = self.to_frame()
- result = df.quantile(q=q, interpolation=interpolation,
- numeric_only=False)
+ result = df.quantile(q=q, interpolation=interpolation, numeric_only=False)
if result.ndim == 2:
result = result.iloc[:, 0]
if is_list_like(q):
result.name = self.name
- return self._constructor(result,
- index=Float64Index(q),
- name=self.name)
+ return self._constructor(result, index=Float64Index(q), name=self.name)
else:
# scalar
return result.iloc[0]
- def corr(self, other, method='pearson', min_periods=None):
+ def corr(self, other, method="pearson", min_periods=None):
"""
Compute correlation with `other` Series, excluding missing values.
@@ -2315,17 +2435,20 @@ def corr(self, other, method='pearson', min_periods=None):
>>> s1.corr(s2, method=histogram_intersection)
0.3
"""
- this, other = self.align(other, join='inner', copy=False)
+ this, other = self.align(other, join="inner", copy=False)
if len(this) == 0:
return np.nan
- if method in ['pearson', 'spearman', 'kendall'] or callable(method):
- return nanops.nancorr(this.values, other.values, method=method,
- min_periods=min_periods)
+ if method in ["pearson", "spearman", "kendall"] or callable(method):
+ return nanops.nancorr(
+ this.values, other.values, method=method, min_periods=min_periods
+ )
- raise ValueError("method must be either 'pearson', "
- "'spearman', 'kendall', or a callable, "
- "'{method}' was supplied".format(method=method))
+ raise ValueError(
+ "method must be either 'pearson', "
+ "'spearman', 'kendall', or a callable, "
+ "'{method}' was supplied".format(method=method)
+ )
def cov(self, other, min_periods=None):
"""
@@ -2351,11 +2474,10 @@ def cov(self, other, min_periods=None):
>>> s1.cov(s2)
-0.01685762652715874
"""
- this, other = self.align(other, join='inner', copy=False)
+ this, other = self.align(other, join="inner", copy=False)
if len(this) == 0:
return np.nan
- return nanops.nancov(this.values, other.values,
- min_periods=min_periods)
+ return nanops.nancov(this.values, other.values, min_periods=min_periods)
def diff(self, periods=1):
"""
@@ -2517,11 +2639,11 @@ def dot(self, other):
array([24, 14])
"""
from pandas.core.frame import DataFrame
+
if isinstance(other, (Series, DataFrame)):
common = self.index.union(other.index)
- if (len(common) > len(self.index) or
- len(common) > len(other.index)):
- raise ValueError('matrices are not aligned')
+ if len(common) > len(self.index) or len(common) > len(other.index):
+ raise ValueError("matrices are not aligned")
left = self.reindex(index=common, copy=False)
right = other.reindex(index=common, copy=False)
@@ -2531,18 +2653,20 @@ def dot(self, other):
lvals = self.values
rvals = np.asarray(other)
if lvals.shape[0] != rvals.shape[0]:
- raise Exception('Dot product shape mismatch, %s vs %s' %
- (lvals.shape, rvals.shape))
+ raise Exception(
+ "Dot product shape mismatch, %s vs %s" % (lvals.shape, rvals.shape)
+ )
if isinstance(other, DataFrame):
- return self._constructor(np.dot(lvals, rvals),
- index=other.columns).__finalize__(self)
+ return self._constructor(
+ np.dot(lvals, rvals), index=other.columns
+ ).__finalize__(self)
elif isinstance(other, Series):
return np.dot(lvals, rvals)
elif isinstance(rvals, np.ndarray):
return np.dot(lvals, rvals)
else: # pragma: no cover
- raise TypeError('unsupported type: %s' % type(other))
+ raise TypeError("unsupported type: %s" % type(other))
def __matmul__(self, other):
"""
@@ -2556,11 +2680,10 @@ def __rmatmul__(self, other):
"""
return self.dot(np.transpose(other))
- @Substitution(klass='Series')
- @Appender(base._shared_docs['searchsorted'])
- def searchsorted(self, value, side='left', sorter=None):
- return algorithms.searchsorted(self._values, value,
- side=side, sorter=sorter)
+ @Substitution(klass="Series")
+ @Appender(base._shared_docs["searchsorted"])
+ def searchsorted(self, value, side="left", sorter=None):
+ return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)
# -------------------------------------------------------------------
# Combination
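For context (outside the patch itself): the wrapper above simply forwards to algorithms.searchsorted on the underlying values, so it behaves like numpy's searchsorted on sorted data:

import pandas as pd

s = pd.Series([1, 3, 5, 7])
print(s.searchsorted(4))                 # 2: insertion point that keeps the order
print(s.searchsorted(3, side="right"))   # 2: to the right of the existing 3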
@@ -2644,8 +2767,9 @@ def append(self, to_append, ignore_index=False, verify_integrity=False):
to_concat = [self] + to_append
else:
to_concat = [self, to_append]
- return concat(to_concat, ignore_index=ignore_index,
- verify_integrity=verify_integrity)
+ return concat(
+ to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity
+ )
def _binop(self, other, func, level=None, fill_value=None):
"""
@@ -2668,24 +2792,22 @@ def _binop(self, other, func, level=None, fill_value=None):
"""
if not isinstance(other, Series):
- raise AssertionError('Other operand must be Series')
+ raise AssertionError("Other operand must be Series")
new_index = self.index
this = self
if not self.index.equals(other.index):
- this, other = self.align(other, level=level, join='outer',
- copy=False)
+ this, other = self.align(other, level=level, join="outer", copy=False)
new_index = this.index
- this_vals, other_vals = ops.fill_binop(this.values, other.values,
- fill_value)
+ this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = func(this_vals, other_vals)
name = ops.get_op_result_name(self, other)
- if func.__name__ in ['divmod', 'rdivmod']:
+ if func.__name__ in ["divmod", "rdivmod"]:
ret = ops._construct_divmod_result(self, result, new_index, name)
else:
ret = ops._construct_result(self, result, new_index, name)
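For context (outside the patch itself): the alignment and fill_value handling in _binop is what the flexible arithmetic methods expose publicly:

import pandas as pd

a = pd.Series([1.0, 2.0], index=["x", "y"])
b = pd.Series([10.0], index=["y"])
print(a.add(b, fill_value=0).tolist())   # [1.0, 12.0]: missing labels filled with 0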
@@ -2770,13 +2892,13 @@ def combine(self, other, func, fill_value=None):
for idx in new_index:
lv = self.get(idx, fill_value)
rv = other.get(idx, fill_value)
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
new_values.append(func(lv, rv))
else:
# Assume that other is a scalar, so apply the function for
# each element in the Series
new_index = self.index
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
new_values = [func(lv, other) for lv in self._values]
new_name = self.name
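For context (outside the patch itself): the errstate-wrapped loop above drives Series.combine, which applies func over the union of the two indexes and substitutes fill_value where a label is missing:

import pandas as pd

a = pd.Series([1, 2], index=["x", "y"])
b = pd.Series([3], index=["y"])
print(a.combine(b, max, fill_value=0).tolist())   # [1, 3]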
@@ -2890,8 +3012,14 @@ def update(self, other):
# ----------------------------------------------------------------------
# Reindexing, sorting
- def sort_values(self, axis=0, ascending=True, inplace=False,
- kind='quicksort', na_position='last'):
+ def sort_values(
+ self,
+ axis=0,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ ):
"""
Sort by the values.
@@ -2996,14 +3124,16 @@ def sort_values(self, axis=0, ascending=True, inplace=False,
0 z
dtype: object
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# Validate the axis parameter
self._get_axis_number(axis)
# GH 5856/5853
if inplace and self._is_cached:
- raise ValueError("This Series is a view of some other array, to "
- "sort in-place you must create a copy")
+ raise ValueError(
+ "This Series is a view of some other array, to "
+ "sort in-place you must create a copy"
+ )
def _try_kind_sort(arr):
# easier to ask forgiveness than permission
@@ -3013,7 +3143,7 @@ def _try_kind_sort(arr):
except TypeError:
# stable sort not available for object dtype
# uses the argsort default quicksort
- return arr.argsort(kind='quicksort')
+ return arr.argsort(kind="quicksort")
arr = self._values
sortedIdx = np.empty(len(self), dtype=np.int32)
@@ -3027,26 +3157,28 @@ def _try_kind_sort(arr):
if is_list_like(ascending):
if len(ascending) != 1:
- raise ValueError('Length of ascending (%d) must be 1 '
- 'for Series' % (len(ascending)))
+ raise ValueError(
+ "Length of ascending (%d) must be 1 "
+ "for Series" % (len(ascending))
+ )
ascending = ascending[0]
if not is_bool(ascending):
- raise ValueError('ascending must be boolean')
+ raise ValueError("ascending must be boolean")
if not ascending:
argsorted = argsorted[::-1]
- if na_position == 'last':
+ if na_position == "last":
n = good.sum()
sortedIdx[:n] = idx[good][argsorted]
sortedIdx[n:] = idx[bad]
- elif na_position == 'first':
+ elif na_position == "first":
n = bad.sum()
sortedIdx[n:] = idx[good][argsorted]
sortedIdx[:n] = idx[bad]
else:
- raise ValueError('invalid na_position: {!r}'.format(na_position))
+ raise ValueError("invalid na_position: {!r}".format(na_position))
result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx])
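For context (outside the patch itself): na_position decides which end of the sorted result receives the missing values:

import numpy as np
import pandas as pd

s = pd.Series([3.0, np.nan, 1.0])
print(s.sort_values(na_position="first").tolist())   # [nan, 1.0, 3.0]
print(s.sort_values(na_position="last").tolist())    # [1.0, 3.0, nan]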
@@ -3055,8 +3187,16 @@ def _try_kind_sort(arr):
else:
return result.__finalize__(self)
- def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
- kind='quicksort', na_position='last', sort_remaining=True):
+ def sort_index(
+ self,
+ axis=0,
+ level=None,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ sort_remaining=True,
+ ):
"""
Sort Series by index labels.
@@ -3169,34 +3309,40 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
"""
# TODO: this can be combined with DataFrame.sort_index impl as
# almost identical
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# Validate the axis parameter
self._get_axis_number(axis)
index = self.index
if level is not None:
- new_index, indexer = index.sortlevel(level, ascending=ascending,
- sort_remaining=sort_remaining)
+ new_index, indexer = index.sortlevel(
+ level, ascending=ascending, sort_remaining=sort_remaining
+ )
elif isinstance(index, MultiIndex):
from pandas.core.sorting import lexsort_indexer
+
labels = index._sort_levels_monotonic()
- indexer = lexsort_indexer(labels._get_codes_for_sorting(),
- orders=ascending,
- na_position=na_position)
+ indexer = lexsort_indexer(
+ labels._get_codes_for_sorting(),
+ orders=ascending,
+ na_position=na_position,
+ )
else:
from pandas.core.sorting import nargsort
# Check monotonic-ness before sort an index
# GH11080
- if ((ascending and index.is_monotonic_increasing) or
- (not ascending and index.is_monotonic_decreasing)):
+ if (ascending and index.is_monotonic_increasing) or (
+ not ascending and index.is_monotonic_decreasing
+ ):
if inplace:
return
else:
return self.copy()
- indexer = nargsort(index, kind=kind, ascending=ascending,
- na_position=na_position)
+ indexer = nargsort(
+ index, kind=kind, ascending=ascending, na_position=na_position
+ )
indexer = ensure_platform_int(indexer)
new_index = index.take(indexer)
@@ -3210,7 +3356,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
else:
return result.__finalize__(self)
- def argsort(self, axis=0, kind='quicksort', order=None):
+ def argsort(self, axis=0, kind="quicksort", order=None):
"""
Override ndarray.argsort. Argsorts the value, omitting NA/null values,
and places the result in the same locations as the non-NA values.
@@ -3239,18 +3385,16 @@ def argsort(self, axis=0, kind='quicksort', order=None):
mask = isna(values)
if mask.any():
- result = Series(-1, index=self.index, name=self.name,
- dtype='int64')
+ result = Series(-1, index=self.index, name=self.name, dtype="int64")
notmask = ~mask
result[notmask] = np.argsort(values[notmask], kind=kind)
- return self._constructor(result,
- index=self.index).__finalize__(self)
+ return self._constructor(result, index=self.index).__finalize__(self)
else:
return self._constructor(
- np.argsort(values, kind=kind), index=self.index,
- dtype='int64').__finalize__(self)
+ np.argsort(values, kind=kind), index=self.index, dtype="int64"
+ ).__finalize__(self)
- def nlargest(self, n=5, keep='first'):
+ def nlargest(self, n=5, keep="first"):
"""
Return the largest `n` elements.
@@ -3348,7 +3492,7 @@ def nlargest(self, n=5, keep='first'):
"""
return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
- def nsmallest(self, n=5, keep='first'):
+ def nsmallest(self, n=5, keep="first"):
"""
Return the smallest `n` elements.
@@ -3465,8 +3609,9 @@ def swaplevel(self, i=-2, j=-1, copy=True):
the two innermost levels of the index.
"""
new_index = self.index.swaplevel(i, j)
- return self._constructor(self._values, index=new_index,
- copy=copy).__finalize__(self)
+ return self._constructor(self._values, index=new_index, copy=copy).__finalize__(
+ self
+ )
def reorder_levels(self, order):
"""
@@ -3484,7 +3629,7 @@ def reorder_levels(self, order):
type of caller (new object)
"""
if not isinstance(self.index, MultiIndex): # pragma: no cover
- raise Exception('Can only reorder levels on a hierarchical axis.')
+ raise Exception("Can only reorder levels on a hierarchical axis.")
result = self.copy()
result.index = result.index.reorder_levels(order)
@@ -3532,6 +3677,7 @@ def unstack(self, level=-1, fill_value=None):
b 2 4
"""
from pandas.core.reshape.reshape import unstack
+
return unstack(self, level, fill_value)
# ----------------------------------------------------------------------
@@ -3612,10 +3758,8 @@ def map(self, arg, na_action=None):
3 I am a rabbit
dtype: object
"""
- new_values = super()._map_values(
- arg, na_action=na_action)
- return self._constructor(new_values,
- index=self.index).__finalize__(self)
+ new_values = super()._map_values(arg, na_action=na_action)
+ return self._constructor(new_values, index=self.index).__finalize__(self)
def _gotitem(self, key, ndim, subset=None):
"""
@@ -3631,14 +3775,17 @@ def _gotitem(self, key, ndim, subset=None):
"""
return self
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
Series.apply : Invoke function on a Series.
Series.transform : Transform function producing a Series with like indexes.
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
>>> s = pd.Series([1, 2, 3, 4])
@@ -3656,13 +3803,16 @@ def _gotitem(self, key, ndim, subset=None):
min 1
max 4
dtype: int64
- """)
+ """
+ )
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='\n.. versionadded:: 0.20.0\n',
- **_shared_doc_kwargs)
- @Appender(generic._shared_docs['aggregate'])
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="\n.. versionadded:: 0.20.0\n",
+ **_shared_doc_kwargs
+ )
+ @Appender(generic._shared_docs["aggregate"])
def aggregate(self, func, axis=0, *args, **kwargs):
# Validate the axis parameter
self._get_axis_number(axis)
@@ -3671,8 +3821,8 @@ def aggregate(self, func, axis=0, *args, **kwargs):
# we can be called from an inner function which
# passes this meta-data
- kwargs.pop('_axis', None)
- kwargs.pop('_level', None)
+ kwargs.pop("_axis", None)
+ kwargs.pop("_level", None)
# try a regular apply, this evaluates lambdas
# row-by-row; however if the lambda is expected a Series
@@ -3691,7 +3841,7 @@ def aggregate(self, func, axis=0, *args, **kwargs):
agg = aggregate
- @Appender(generic._shared_docs['transform'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["transform"] % _shared_doc_kwargs)
def transform(self, func, axis=0, *args, **kwargs):
# Validate the axis parameter
self._get_axis_number(axis)
@@ -3795,8 +3945,9 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
dtype: float64
"""
if len(self) == 0:
- return self._constructor(dtype=self.dtype,
- index=self.index).__finalize__(self)
+ return self._constructor(dtype=self.dtype, index=self.index).__finalize__(
+ self
+ )
# dispatch to agg
if isinstance(func, (list, dict)):
@@ -3808,12 +3959,14 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
# handle ufuncs and lambdas
if kwds or args and not isinstance(func, np.ufunc):
+
def f(x):
return func(x, *args, **kwds)
+
else:
f = func
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
if isinstance(f, np.ufunc):
return f(self)
@@ -3827,14 +3980,13 @@ def f(x):
if len(mapped) and isinstance(mapped[0], Series):
# GH 25959 use pd.array instead of tolist
# so extension arrays can be used
- return self._constructor_expanddim(pd.array(mapped),
- index=self.index)
+ return self._constructor_expanddim(pd.array(mapped), index=self.index)
else:
- return self._constructor(mapped,
- index=self.index).__finalize__(self)
+ return self._constructor(mapped, index=self.index).__finalize__(self)
- def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
- filter_type=None, **kwds):
+ def _reduce(
+ self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
+ ):
"""
Perform a reduction operation.
@@ -3864,17 +4016,24 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
# dispatch to numpy arrays
elif isinstance(delegate, np.ndarray):
if numeric_only:
- raise NotImplementedError('Series.{0} does not implement '
- 'numeric_only.'.format(name))
- with np.errstate(all='ignore'):
+ raise NotImplementedError(
+ "Series.{0} does not implement " "numeric_only.".format(name)
+ )
+ with np.errstate(all="ignore"):
return op(delegate, skipna=skipna, **kwds)
# TODO(EA) dispatch to Index
# remove once all internals extension types are
# moved to ExtensionArrays
- return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
- numeric_only=numeric_only,
- filter_type=filter_type, **kwds)
+ return delegate._reduce(
+ op=op,
+ name=name,
+ axis=axis,
+ skipna=skipna,
+ numeric_only=numeric_only,
+ filter_type=filter_type,
+ **kwds
+ )
def _reindex_indexer(self, new_index, indexer, copy):
if indexer is None:
@@ -3882,8 +4041,9 @@ def _reindex_indexer(self, new_index, indexer, copy):
return self.copy()
return self
- new_values = algorithms.take_1d(self._values, indexer,
- allow_fill=True, fill_value=None)
+ new_values = algorithms.take_1d(
+ self._values, indexer, allow_fill=True, fill_value=None
+ )
return self._constructor(new_values, index=new_index)
def _needs_reindex_multi(self, axes, method, level):
@@ -3893,14 +4053,32 @@ def _needs_reindex_multi(self, axes, method, level):
"""
return False
- @Appender(generic._shared_docs['align'] % _shared_doc_kwargs)
- def align(self, other, join='outer', axis=None, level=None, copy=True,
- fill_value=None, method=None, limit=None, fill_axis=0,
- broadcast_axis=None):
- return super().align(other, join=join, axis=axis, level=level,
- copy=copy, fill_value=fill_value, method=method,
- limit=limit, fill_axis=fill_axis,
- broadcast_axis=broadcast_axis)
+ @Appender(generic._shared_docs["align"] % _shared_doc_kwargs)
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ broadcast_axis=None,
+ ):
+ return super().align(
+ other,
+ join=join,
+ axis=axis,
+ level=level,
+ copy=copy,
+ fill_value=fill_value,
+ method=method,
+ limit=limit,
+ fill_axis=fill_axis,
+ broadcast_axis=broadcast_axis,
+ )
def rename(self, index=None, **kwargs):
"""
@@ -3963,13 +4141,13 @@ def rename(self, index=None, **kwargs):
5 3
dtype: int64
"""
- kwargs['inplace'] = validate_bool_kwarg(kwargs.get('inplace', False),
- 'inplace')
+ kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace")
- non_mapping = is_scalar(index) or (is_list_like(index) and
- not is_dict_like(index))
+ non_mapping = is_scalar(index) or (
+ is_list_like(index) and not is_dict_like(index)
+ )
if non_mapping:
- return self._set_name(index, inplace=kwargs.get('inplace'))
+ return self._set_name(index, inplace=kwargs.get("inplace"))
return super().rename(index=index, **kwargs)
@Substitution(**_shared_doc_kwargs)
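For context (outside the patch itself): the non_mapping check above is the split between renaming the Series itself (scalar or list-like argument) and relabelling its index (dict-like or callable argument):

import pandas as pd

s = pd.Series([1, 2, 3])
print(s.rename("numbers").name)            # 'numbers': a scalar renames the Series
print(s.rename({0: "a"}).index.tolist())   # ['a', 1, 2]: a mapping relabels the index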
@@ -3977,8 +4155,16 @@ def rename(self, index=None, **kwargs):
def reindex(self, index=None, **kwargs):
return super().reindex(index=index, **kwargs)
- def drop(self, labels=None, axis=0, index=None, columns=None,
- level=None, inplace=False, errors='raise'):
+ def drop(
+ self,
+ labels=None,
+ axis=0,
+ index=None,
+ columns=None,
+ level=None,
+ inplace=False,
+ errors="raise",
+ ):
"""
Return Series with specified index labels removed.
@@ -4065,29 +4251,62 @@ def drop(self, labels=None, axis=0, index=None, columns=None,
length 0.3
dtype: float64
"""
- return super().drop(labels=labels, axis=axis, index=index,
- columns=columns, level=level, inplace=inplace,
- errors=errors)
+ return super().drop(
+ labels=labels,
+ axis=axis,
+ index=index,
+ columns=columns,
+ level=level,
+ inplace=inplace,
+ errors=errors,
+ )
@Substitution(**_shared_doc_kwargs)
@Appender(generic.NDFrame.fillna.__doc__)
- def fillna(self, value=None, method=None, axis=None, inplace=False,
- limit=None, downcast=None, **kwargs):
- return super().fillna(value=value, method=method, axis=axis,
- inplace=inplace, limit=limit, downcast=downcast,
- **kwargs)
-
- @Appender(generic._shared_docs['replace'] % _shared_doc_kwargs)
- def replace(self, to_replace=None, value=None, inplace=False, limit=None,
- regex=False, method='pad'):
- return super().replace(to_replace=to_replace, value=value,
- inplace=inplace, limit=limit, regex=regex,
- method=method)
-
- @Appender(generic._shared_docs['shift'] % _shared_doc_kwargs)
+ def fillna(
+ self,
+ value=None,
+ method=None,
+ axis=None,
+ inplace=False,
+ limit=None,
+ downcast=None,
+ **kwargs
+ ):
+ return super().fillna(
+ value=value,
+ method=method,
+ axis=axis,
+ inplace=inplace,
+ limit=limit,
+ downcast=downcast,
+ **kwargs
+ )
+
+ @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs)
+ def replace(
+ self,
+ to_replace=None,
+ value=None,
+ inplace=False,
+ limit=None,
+ regex=False,
+ method="pad",
+ ):
+ return super().replace(
+ to_replace=to_replace,
+ value=value,
+ inplace=inplace,
+ limit=limit,
+ regex=regex,
+ method=method,
+ )
+
+ @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs)
def shift(self, periods=1, freq=None, axis=0, fill_value=None):
- return super().shift(periods=periods, freq=freq, axis=axis,
- fill_value=fill_value)
+ return super().shift(
+ periods=periods, freq=freq, axis=axis, fill_value=fill_value
+ )
def memory_usage(self, index=True, deep=False):
"""
@@ -4153,13 +4372,14 @@ def _take(self, indices, axis=0, is_copy=False):
# https://github.com/pandas-dev/pandas/issues/20664
# TODO: remove when the default Categorical.take behavior changes
indices = maybe_convert_indices(indices, len(self._get_axis(axis)))
- kwargs = {'allow_fill': False}
+ kwargs = {"allow_fill": False}
else:
kwargs = {}
new_values = self._values.take(indices, **kwargs)
- result = (self._constructor(new_values, index=new_index,
- fastpath=True).__finalize__(self))
+ result = self._constructor(
+ new_values, index=new_index, fastpath=True
+ ).__finalize__(self)
# Maybe set copy if we didn't actually change the index.
if is_copy:
@@ -4306,21 +4526,51 @@ def between(self, left, right, inclusive=True):
@Appender(generic.NDFrame.to_csv.__doc__)
def to_csv(self, *args, **kwargs):
- names = ["path_or_buf", "sep", "na_rep", "float_format", "columns",
- "header", "index", "index_label", "mode", "encoding",
- "compression", "quoting", "quotechar", "line_terminator",
- "chunksize", "date_format", "doublequote",
- "escapechar", "decimal"]
-
- old_names = ["path_or_buf", "index", "sep", "na_rep", "float_format",
- "header", "index_label", "mode", "encoding",
- "compression", "date_format", "decimal"]
+ names = [
+ "path_or_buf",
+ "sep",
+ "na_rep",
+ "float_format",
+ "columns",
+ "header",
+ "index",
+ "index_label",
+ "mode",
+ "encoding",
+ "compression",
+ "quoting",
+ "quotechar",
+ "line_terminator",
+ "chunksize",
+ "date_format",
+ "doublequote",
+ "escapechar",
+ "decimal",
+ ]
+
+ old_names = [
+ "path_or_buf",
+ "index",
+ "sep",
+ "na_rep",
+ "float_format",
+ "header",
+ "index_label",
+ "mode",
+ "encoding",
+ "compression",
+ "date_format",
+ "decimal",
+ ]
if "path" in kwargs:
- warnings.warn("The signature of `Series.to_csv` was aligned "
- "to that of `DataFrame.to_csv`, and argument "
- "'path' will be renamed to 'path_or_buf'.",
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The signature of `Series.to_csv` was aligned "
+ "to that of `DataFrame.to_csv`, and argument "
+ "'path' will be renamed to 'path_or_buf'.",
+ FutureWarning,
+ stacklevel=2,
+ )
kwargs["path_or_buf"] = kwargs.pop("path")
if len(args) > 1:
@@ -4330,49 +4580,57 @@ def to_csv(self, *args, **kwargs):
if not (is_string_like(maybe_sep) and len(maybe_sep) == 1):
# old signature
- warnings.warn("The signature of `Series.to_csv` was aligned "
- "to that of `DataFrame.to_csv`. Note that the "
- "order of arguments changed, and the new one "
- "has 'sep' in first place, for which \"{}\" is "
- "not a valid value. The old order will cease to "
- "be supported in a future version. Please refer "
- "to the documentation for `DataFrame.to_csv` "
- "when updating your function "
- "calls.".format(maybe_sep),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ "The signature of `Series.to_csv` was aligned "
+ "to that of `DataFrame.to_csv`. Note that the "
+ "order of arguments changed, and the new one "
+ "has 'sep' in first place, for which \"{}\" is "
+ "not a valid value. The old order will cease to "
+ "be supported in a future version. Please refer "
+ "to the documentation for `DataFrame.to_csv` "
+ "when updating your function "
+ "calls.".format(maybe_sep),
+ FutureWarning,
+ stacklevel=2,
+ )
names = old_names
- pos_args = dict(zip(names[:len(args)], args))
+ pos_args = dict(zip(names[: len(args)], args))
for key in pos_args:
if key in kwargs:
- raise ValueError("Argument given by name ('{}') and position "
- "({})".format(key, names.index(key)))
+ raise ValueError(
+ "Argument given by name ('{}') and position "
+ "({})".format(key, names.index(key))
+ )
kwargs[key] = pos_args[key]
if kwargs.get("header", None) is None:
- warnings.warn("The signature of `Series.to_csv` was aligned "
- "to that of `DataFrame.to_csv`, and argument "
- "'header' will change its default value from False "
- "to True: please pass an explicit value to suppress "
- "this warning.", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "The signature of `Series.to_csv` was aligned "
+ "to that of `DataFrame.to_csv`, and argument "
+ "'header' will change its default value from False "
+ "to True: please pass an explicit value to suppress "
+ "this warning.",
+ FutureWarning,
+ stacklevel=2,
+ )
kwargs["header"] = False # Backwards compatibility.
return self.to_frame().to_csv(**kwargs)
- @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
def isna(self):
return super().isna()
- @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
def isnull(self):
return super().isnull()
- @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs)
def notna(self):
return super().notna()
- @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs)
def notnull(self):
return super().notnull()
@@ -4447,11 +4705,13 @@ def dropna(self, axis=0, inplace=False, **kwargs):
5 I stay
dtype: object
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
- kwargs.pop('how', None)
+ inplace = validate_bool_kwarg(inplace, "inplace")
+ kwargs.pop("how", None)
if kwargs:
- raise TypeError('dropna() got an unexpected keyword '
- 'argument "{0}"'.format(list(kwargs.keys())[0]))
+ raise TypeError(
+ "dropna() got an unexpected keyword "
+ 'argument "{0}"'.format(list(kwargs.keys())[0])
+ )
# Validate the axis parameter
self._get_axis_number(axis or 0)
@@ -4480,14 +4740,18 @@ def valid(self, inplace=False, **kwargs):
Series
Series without null values.
"""
- warnings.warn("Method .valid will be removed in a future version. "
- "Use .dropna instead.", FutureWarning, stacklevel=2)
+ warnings.warn(
+ "Method .valid will be removed in a future version. "
+ "Use .dropna instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
return self.dropna(inplace=inplace, **kwargs)
# ----------------------------------------------------------------------
# Time series-oriented methods
- def to_timestamp(self, freq=None, how='start', copy=True):
+ def to_timestamp(self, freq=None, how="start", copy=True):
"""
Cast to DatetimeIndex of Timestamps, at *beginning* of period.
@@ -4510,8 +4774,7 @@ def to_timestamp(self, freq=None, how='start', copy=True):
new_values = new_values.copy()
new_index = self.index.to_timestamp(freq=freq, how=how)
- return self._constructor(new_values,
- index=new_index).__finalize__(self)
+ return self._constructor(new_values, index=new_index).__finalize__(self)
def to_period(self, freq=None, copy=True):
"""
@@ -4535,8 +4798,7 @@ def to_period(self, freq=None, copy=True):
new_values = new_values.copy()
new_index = self.index.to_period(freq=freq)
- return self._constructor(new_values,
- index=new_index).__finalize__(self)
+ return self._constructor(new_values, index=new_index).__finalize__(self)
# ----------------------------------------------------------------------
# Accessor Methods
@@ -4552,8 +4814,13 @@ def to_period(self, freq=None, copy=True):
hist = pandas.plotting.hist_series
-Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0},
- docs={'index': 'The index (axis labels) of the Series.'})
+Series._setup_axes(
+ ["index"],
+ info_axis=0,
+ stat_axis=0,
+ aliases={"rows": 0},
+ docs={"index": "The index (axis labels) of the Series."},
+)
Series._add_numeric_operations()
Series._add_series_only_operations()
Series._add_series_or_dataframe_operations()
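For readers skimming the hunks above: black normalises string literals to double quotes and, once a call no longer fits within 88 characters, explodes it to one argument per line with a trailing comma. A minimal sketch of that convention, using a made-up warning message rather than pandas code:

import warnings

# Exploded call with black's trailing ("magic") comma; the literal is double-quoted.
warnings.warn(
    "this argument will be renamed in a future version",  # hypothetical message
    FutureWarning,
    stacklevel=2,
)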
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index b79390581612b..523c4dc5e867b 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -6,8 +6,12 @@
from pandas.core.dtypes.cast import infer_dtype_from_array
from pandas.core.dtypes.common import (
- ensure_int64, ensure_platform_int, is_categorical_dtype,
- is_extension_array_dtype, is_list_like)
+ ensure_int64,
+ ensure_platform_int,
+ is_categorical_dtype,
+ is_extension_array_dtype,
+ is_list_like,
+)
from pandas.core.dtypes.missing import isna
import pandas.core.algorithms as algorithms
@@ -42,6 +46,7 @@ def get_group_index(labels, shape, sort, xnull):
An array of type int64 where two elements are equal if their corresponding
labels are equal at all locations.
"""
+
def _int64_cut_off(shape):
acc = 1
for i, mul in enumerate(shape):
@@ -69,8 +74,8 @@ def maybe_lift(lab, size):
nlev = _int64_cut_off(shape)
# compute flat ids for the first `nlev` levels
- stride = np.prod(shape[1:nlev], dtype='i8')
- out = stride * labels[0].astype('i8', subok=False, copy=False)
+ stride = np.prod(shape[1:nlev], dtype="i8")
+ out = stride * labels[0].astype("i8", subok=False, copy=False)
for i in range(1, nlev):
if shape[i] == 0:
@@ -132,7 +137,7 @@ def decons_group_index(comp_labels, shape):
if is_int64_overflow_possible(shape):
# at some point group indices are factorized,
# and may not be deconstructed here! wrong path!
- raise ValueError('cannot deconstruct factorized group indices!')
+ raise ValueError("cannot deconstruct factorized group indices!")
label_list = []
factor = 1
@@ -158,17 +163,16 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):
"""
if not xnull:
- lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')
- shape = np.asarray(shape, dtype='i8') + lift
+ lift = np.fromiter(((a == -1).any() for a in labels), dtype="i8")
+ shape = np.asarray(shape, dtype="i8") + lift
if not is_int64_overflow_possible(shape):
# obs ids are deconstructable! take the fast route!
out = decons_group_index(obs_ids, shape)
- return out if xnull or not lift.any() \
- else [x - y for x, y in zip(out, lift)]
+ return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
i = unique_label_indices(comp_ids)
- i8copy = lambda a: a.astype('i8', subok=False, copy=True)
+ i8copy = lambda a: a.astype("i8", subok=False, copy=True)
return [i8copy(lab[i]) for lab in labels]
@@ -184,7 +188,7 @@ def indexer_from_factorized(labels, shape, compress=True):
return get_group_index_sorter(ids, ngroups)
-def lexsort_indexer(keys, orders=None, na_position='last'):
+def lexsort_indexer(keys, orders=None, na_position="last"):
from pandas.core.arrays import Categorical
labels = []
@@ -204,22 +208,22 @@ def lexsort_indexer(keys, orders=None, na_position='last'):
else:
c = Categorical(key, ordered=True)
- if na_position not in ['last', 'first']:
- raise ValueError('invalid na_position: {!r}'.format(na_position))
+ if na_position not in ["last", "first"]:
+ raise ValueError("invalid na_position: {!r}".format(na_position))
n = len(c.categories)
codes = c.codes.copy()
- mask = (c.codes == -1)
+ mask = c.codes == -1
if order: # ascending
- if na_position == 'last':
+ if na_position == "last":
codes = np.where(mask, n, codes)
- elif na_position == 'first':
+ elif na_position == "first":
codes += 1
else: # not order means descending
- if na_position == 'last':
+ if na_position == "last":
codes = np.where(mask, n, n - codes - 1)
- elif na_position == 'first':
+ elif na_position == "first":
codes = np.where(mask, 0, n - codes)
if mask.any():
n += 1
@@ -230,7 +234,7 @@ def lexsort_indexer(keys, orders=None, na_position='last'):
return indexer_from_factorized(labels, shape)
-def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
+def nargsort(items, kind="quicksort", ascending=True, na_position="last"):
"""
This is intended to be a drop-in replacement for np.argsort which
handles NaNs. It adds ascending and na_position parameters.
@@ -258,12 +262,12 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
indexer = indexer[::-1]
# Finally, place the NaNs at the end or the beginning according to
# na_position
- if na_position == 'last':
+ if na_position == "last":
indexer = np.concatenate([indexer, nan_idx])
- elif na_position == 'first':
+ elif na_position == "first":
indexer = np.concatenate([nan_idx, indexer])
else:
- raise ValueError('invalid na_position: {!r}'.format(na_position))
+ raise ValueError("invalid na_position: {!r}".format(na_position))
return indexer
@@ -279,8 +283,7 @@ def __init__(self, comp_ids, ngroups, levels, labels):
self.comp_ids = comp_ids.astype(np.int64)
self.k = len(labels)
- self.tables = [hashtable.Int64HashTable(ngroups)
- for _ in range(self.k)]
+ self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)]
self._populate_tables()
@@ -289,8 +292,10 @@ def _populate_tables(self):
table.map(self.comp_ids, labs.astype(np.int64))
def get_key(self, comp_id):
- return tuple(level[table.get_item(comp_id)]
- for table, level in zip(self.tables, self.levels))
+ return tuple(
+ level[table.get_item(comp_id)]
+ for table, level in zip(self.tables, self.levels)
+ )
def get_flattened_iterator(comp_ids, ngroups, levels, labels):
@@ -304,9 +309,11 @@ def get_indexer_dict(label_list, keys):
shape = list(map(len, keys))
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
- ngroups = ((group_index.size and group_index.max()) + 1) \
- if is_int64_overflow_possible(shape) \
- else np.prod(shape, dtype='i8')
+ ngroups = (
+ ((group_index.size and group_index.max()) + 1)
+ if is_int64_overflow_possible(shape)
+ else np.prod(shape, dtype="i8")
+ )
sorter = get_group_index_sorter(group_index, ngroups)
@@ -319,6 +326,7 @@ def get_indexer_dict(label_list, keys):
# ----------------------------------------------------------------------
# sorting levels...cleverly?
+
def get_group_index_sorter(group_index, ngroups):
"""
algos.groupsort_indexer implements `counting sort` and it is at least
@@ -336,14 +344,12 @@ def get_group_index_sorter(group_index, ngroups):
count = len(group_index)
alpha = 0.0 # taking complexities literally; there may be
beta = 1.0 # some room for fine-tuning these parameters
- do_groupsort = (count > 0 and ((alpha + beta * ngroups) <
- (count * np.log(count))))
+ do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
if do_groupsort:
- sorter, _ = algos.groupsort_indexer(ensure_int64(group_index),
- ngroups)
+ sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
return ensure_platform_int(sorter)
else:
- return group_index.argsort(kind='mergesort')
+ return group_index.argsort(kind="mergesort")
def compress_group_index(group_index, sort=True):
@@ -387,8 +393,7 @@ def _reorder_by_uniques(uniques, labels):
return uniques, labels
-def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False,
- verify=True):
+def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True):
"""
Sort ``values`` and reorder corresponding ``labels``.
``values`` should be unique if ``labels`` is not None.
@@ -433,26 +438,27 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False,
* If ``labels`` is not None and ``values`` contain duplicates.
"""
if not is_list_like(values):
- raise TypeError("Only list-like objects are allowed to be passed to"
- "safe_sort as values")
+ raise TypeError(
+ "Only list-like objects are allowed to be passed to" "safe_sort as values"
+ )
- if (not isinstance(values, np.ndarray)
- and not is_extension_array_dtype(values)):
+ if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
# don't convert to string types
dtype, _ = infer_dtype_from_array(values)
values = np.asarray(values, dtype=dtype)
def sort_mixed(values):
# order ints before strings, safe in py3
- str_pos = np.array([isinstance(x, str) for x in values],
- dtype=bool)
+ str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
nums = np.sort(values[~str_pos])
strs = np.sort(values[str_pos])
return np.concatenate([nums, np.asarray(strs, dtype=object)])
sorter = None
- if (not is_extension_array_dtype(values)
- and lib.infer_dtype(values, skipna=False) == 'mixed-integer'):
+ if (
+ not is_extension_array_dtype(values)
+ and lib.infer_dtype(values, skipna=False) == "mixed-integer"
+ ):
# unorderable in py3 if mixed str/int
ordered = sort_mixed(values)
else:
@@ -469,18 +475,22 @@ def sort_mixed(values):
return ordered
if not is_list_like(labels):
- raise TypeError("Only list-like objects or None are allowed to be"
- "passed to safe_sort as labels")
+ raise TypeError(
+ "Only list-like objects or None are allowed to be"
+ "passed to safe_sort as labels"
+ )
labels = ensure_platform_int(np.asarray(labels))
from pandas import Index
+
if not assume_unique and not Index(values).is_unique:
raise ValueError("values should be unique if labels is not None")
if sorter is None:
# mixed types
(hash_klass, _), values = algorithms._get_data_algo(
- values, algorithms._hashtables)
+ values, algorithms._hashtables
+ )
t = hash_klass(len(values))
t.map_locations(values)
sorter = ensure_platform_int(t.lookup(ordered))
@@ -498,7 +508,7 @@ def sort_mixed(values):
reverse_indexer.put(sorter, np.arange(len(sorter)))
# Out of bound indices will be masked with `na_sentinel` next, so we
# may deal with them here without performance loss using `mode='wrap'`
- new_labels = reverse_indexer.take(labels, mode='wrap')
+ new_labels = reverse_indexer.take(labels, mode="wrap")
mask = labels == na_sentinel
if verify:
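The sorting.py hunks above also show black dropping backslash continuations in favour of parenthesised expressions. A small self-contained sketch of the rewritten do_groupsort condition, with placeholder values for count and ngroups (alpha and beta match the constants in the hunk):

import numpy as np

count, ngroups = 1_000, 10  # placeholder sizes, not taken from pandas
alpha, beta = 0.0, 1.0
# One line, no backslash continuation, parentheses kept around the comparison.
do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
print(do_groupsort)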
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index 6a0ba5f93c509..f195e4b5f4e37 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -20,14 +20,13 @@
from pandas.core.frame import DataFrame
import pandas.core.generic as generic
from pandas.core.index import Index, MultiIndex, ensure_index
-from pandas.core.internals import (
- BlockManager, create_block_manager_from_arrays)
+from pandas.core.internals import BlockManager, create_block_manager_from_arrays
from pandas.core.internals.construction import extract_index, prep_ndarray
import pandas.core.ops as ops
from pandas.core.series import Series
from pandas.core.sparse.series import SparseSeries
-_shared_doc_kwargs = dict(klass='SparseDataFrame')
+_shared_doc_kwargs = dict(klass="SparseDataFrame")
depr_msg = """\
SparseDataFrame is deprecated and will be removed in a future version.
Use a regular DataFrame whose columns are SparseArrays instead.
@@ -62,10 +61,19 @@ class SparseDataFrame(DataFrame):
Default fill_value for converting Series to SparseSeries
(default: nan). Will not override SparseSeries passed in.
"""
- _subtyp = 'sparse_frame'
- def __init__(self, data=None, index=None, columns=None, default_kind=None,
- default_fill_value=None, dtype=None, copy=False):
+ _subtyp = "sparse_frame"
+
+ def __init__(
+ self,
+ data=None,
+ index=None,
+ columns=None,
+ default_kind=None,
+ default_fill_value=None,
+ dtype=None,
+ copy=False,
+ ):
warnings.warn(depr_msg, FutureWarning, stacklevel=2)
# pick up the defaults from the Sparse structures
@@ -83,7 +91,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
index = data.index
if default_fill_value is None:
default_fill_value = data.fill_value
- if columns is None and hasattr(data, 'name'):
+ if columns is None and hasattr(data, "name"):
columns = [data.name]
if columns is None:
raise Exception("cannot pass a series w/o a name or columns")
@@ -92,30 +100,33 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
if default_fill_value is None:
default_fill_value = np.nan
if default_kind is None:
- default_kind = 'block'
+ default_kind = "block"
self._default_kind = default_kind
self._default_fill_value = default_fill_value
if is_scipy_sparse(data):
- mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
- fill_value=default_fill_value)
+ mgr = self._init_spmatrix(
+ data, index, columns, dtype=dtype, fill_value=default_fill_value
+ )
elif isinstance(data, dict):
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, (np.ndarray, list)):
mgr = self._init_matrix(data, index, columns, dtype=dtype)
elif isinstance(data, SparseDataFrame):
- mgr = self._init_mgr(data._data,
- dict(index=index, columns=columns),
- dtype=dtype, copy=copy)
+ mgr = self._init_mgr(
+ data._data, dict(index=index, columns=columns), dtype=dtype, copy=copy
+ )
elif isinstance(data, DataFrame):
mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
elif isinstance(data, Series):
- mgr = self._init_dict(data.to_frame(), data.index,
- columns=None, dtype=dtype)
+ mgr = self._init_dict(
+ data.to_frame(), data.index, columns=None, dtype=dtype
+ )
elif isinstance(data, BlockManager):
- mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
- dtype=dtype, copy=copy)
+ mgr = self._init_mgr(
+ data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+ )
elif data is None:
data = DataFrame()
@@ -128,15 +139,20 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
columns = Index([])
else:
for c in columns:
- data[c] = SparseArray(self._default_fill_value,
- index=index, kind=self._default_kind,
- fill_value=self._default_fill_value)
+ data[c] = SparseArray(
+ self._default_fill_value,
+ index=index,
+ kind=self._default_kind,
+ fill_value=self._default_fill_value,
+ )
mgr = to_manager(data, columns, index)
if dtype is not None:
mgr = mgr.astype(dtype)
else:
- msg = ('SparseDataFrame called with unknown type "{data_type}" '
- 'for data argument')
+ msg = (
+ 'SparseDataFrame called with unknown type "{data_type}" '
+ "for data argument"
+ )
raise TypeError(msg.format(data_type=type(data).__name__))
generic.NDFrame.__init__(self, mgr)
@@ -160,9 +176,14 @@ def _init_dict(self, data, index, columns, dtype=None):
index = extract_index(list(data.values()))
def sp_maker(x):
- return SparseArray(x, kind=self._default_kind,
- fill_value=self._default_fill_value,
- copy=True, dtype=dtype)
+ return SparseArray(
+ x,
+ kind=self._default_kind,
+ fill_value=self._default_fill_value,
+ copy=True,
+ dtype=dtype,
+ )
+
sdict = {}
for k, v in data.items():
if isinstance(v, Series):
@@ -188,11 +209,14 @@ def sp_maker(x):
if len(columns.difference(sdict)):
# TODO: figure out how to handle this case, all nan's?
# add in any other columns we want to have (completeness)
- nan_arr = np.empty(len(index), dtype='float64')
+ nan_arr = np.empty(len(index), dtype="float64")
nan_arr.fill(np.nan)
- nan_arr = SparseArray(nan_arr, kind=self._default_kind,
- fill_value=self._default_fill_value,
- copy=False)
+ nan_arr = SparseArray(
+ nan_arr,
+ kind=self._default_kind,
+ fill_value=self._default_fill_value,
+ copy=False,
+ )
sdict.update((c, nan_arr) for c in columns if c not in sdict)
return to_manager(sdict, columns, index)
@@ -206,8 +230,7 @@ def _init_matrix(self, data, index, columns, dtype=None):
data = {idx: data[:, i] for i, idx in enumerate(columns)}
return self._init_dict(data, index, columns, dtype)
- def _init_spmatrix(self, data, index, columns, dtype=None,
- fill_value=None):
+ def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None):
"""
Init self from scipy.sparse matrix.
"""
@@ -225,16 +248,24 @@ def _init_spmatrix(self, data, index, columns, dtype=None,
blocs, blens = get_blocks(rows)
sdict[columns[col]] = SparseSeries(
- rowvals.values, index=index,
+ rowvals.values,
+ index=index,
fill_value=fill_value,
- sparse_index=BlockIndex(N, blocs, blens))
+ sparse_index=BlockIndex(N, blocs, blens),
+ )
# Add any columns that were empty and thus not grouped on above
- sdict.update({column: SparseSeries(index=index,
- fill_value=fill_value,
- sparse_index=BlockIndex(N, [], []))
- for column in columns
- if column not in sdict})
+ sdict.update(
+ {
+ column: SparseSeries(
+ index=index,
+ fill_value=fill_value,
+ sparse_index=BlockIndex(N, [], []),
+ )
+ for column in columns
+ if column not in sdict
+ }
+ )
return self._init_dict(sdict, index, columns, dtype)
@@ -249,9 +280,13 @@ def __repr__(self):
def __getstate__(self):
# pickling
- return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
- _default_fill_value=self._default_fill_value,
- _default_kind=self._default_kind)
+ return dict(
+ _typ=self._typ,
+ _subtyp=self._subtyp,
+ _data=self._data,
+ _default_fill_value=self._default_fill_value,
+ _default_kind=self._default_kind,
+ )
def _unpickle_sparse_frame_compat(self, state):
"""
@@ -261,20 +296,23 @@ def _unpickle_sparse_frame_compat(self, state):
if not isinstance(cols, Index): # pragma: no cover
from pandas.io.pickle import _unpickle_array
+
columns = _unpickle_array(cols)
else:
columns = cols
if not isinstance(idx, Index): # pragma: no cover
from pandas.io.pickle import _unpickle_array
+
index = _unpickle_array(idx)
else:
index = idx
series_dict = DataFrame()
for col, (sp_index, sp_values) in series.items():
- series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,
- fill_value=fv)
+ series_dict[col] = SparseSeries(
+ sp_values, sparse_index=sp_index, fill_value=fv
+ )
self._data = to_manager(series_dict, columns, index)
self._default_fill_value = fv
@@ -289,12 +327,14 @@ def _apply_columns(self, func):
Get new SparseDataFrame applying func to each columns
"""
- new_data = {col: func(series)
- for col, series in self.items()}
+ new_data = {col: func(series) for col, series in self.items()}
return self._constructor(
- data=new_data, index=self.index, columns=self.columns,
- default_fill_value=self.default_fill_value).__finalize__(self)
+ data=new_data,
+ index=self.index,
+ columns=self.columns,
+ default_fill_value=self.default_fill_value,
+ ).__finalize__(self)
def astype(self, dtype):
return self._apply_columns(lambda x: x.astype(dtype))
@@ -322,23 +362,27 @@ def density(self):
Ratio of non-sparse points to total (dense) data points
represented in the frame
"""
- tot_nonsparse = sum(ser.sp_index.npoints
- for _, ser in self.items())
+ tot_nonsparse = sum(ser.sp_index.npoints for _, ser in self.items())
tot = len(self.index) * len(self.columns)
return tot_nonsparse / float(tot)
- def fillna(self, value=None, method=None, axis=0, inplace=False,
- limit=None, downcast=None):
- new_self = super().fillna(value=value, method=method, axis=axis,
- inplace=inplace, limit=limit,
- downcast=downcast)
+ def fillna(
+ self, value=None, method=None, axis=0, inplace=False, limit=None, downcast=None
+ ):
+ new_self = super().fillna(
+ value=value,
+ method=method,
+ axis=axis,
+ inplace=inplace,
+ limit=limit,
+ downcast=downcast,
+ )
if not inplace:
self = new_self
# set the fill value if we are filling as a scalar with nothing special
# going on
- if (value is not None and value == value and method is None and
- limit is None):
+ if value is not None and value == value and method is None and limit is None:
self._default_fill_value = value
if not inplace:
@@ -362,29 +406,35 @@ def _sanitize_column(self, key, value, **kwargs):
sanitized_column : SparseArray
"""
+
def sp_maker(x, index=None):
- return SparseArray(x, index=index,
- fill_value=self._default_fill_value,
- kind=self._default_kind)
+ return SparseArray(
+ x,
+ index=index,
+ fill_value=self._default_fill_value,
+ kind=self._default_kind,
+ )
+
if isinstance(value, SparseSeries):
clean = value.reindex(self.index).as_sparse_array(
- fill_value=self._default_fill_value, kind=self._default_kind)
+ fill_value=self._default_fill_value, kind=self._default_kind
+ )
elif isinstance(value, SparseArray):
if len(value) != len(self.index):
- raise ValueError('Length of values does not match '
- 'length of index')
+ raise ValueError("Length of values does not match " "length of index")
clean = value
- elif hasattr(value, '__iter__'):
+ elif hasattr(value, "__iter__"):
if isinstance(value, Series):
clean = value.reindex(self.index)
if not isinstance(value, SparseSeries):
clean = sp_maker(clean)
else:
if len(value) != len(self.index):
- raise ValueError('Length of values does not match '
- 'length of index')
+ raise ValueError(
+ "Length of values does not match " "length of index"
+ )
clean = sp_maker(value)
# Scalar
@@ -412,10 +462,13 @@ def get_value(self, index, col, takeable=False):
-------
value : scalar value
"""
- warnings.warn("get_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._get_value(index, col, takeable=takeable)
def _get_value(self, index, col, takeable=False):
@@ -425,6 +478,7 @@ def _get_value(self, index, col, takeable=False):
series = self._get_item_cache(col)
return series._get_value(index, takeable=takeable)
+
_get_value.__doc__ = get_value.__doc__
def set_value(self, index, col, value, takeable=False):
@@ -452,17 +506,21 @@ def set_value(self, index, col, value, takeable=False):
-------
frame : DataFrame
"""
- warnings.warn("set_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._set_value(index, col, value, takeable=takeable)
def _set_value(self, index, col, value, takeable=False):
- dense = self.to_dense()._set_value(
- index, col, value, takeable=takeable)
- return dense.to_sparse(kind=self._default_kind,
- fill_value=self._default_fill_value)
+ dense = self.to_dense()._set_value(index, col, value, takeable=takeable)
+ return dense.to_sparse(
+ kind=self._default_kind, fill_value=self._default_fill_value
+ )
+
_set_value.__doc__ = set_value.__doc__
def _slice(self, slobj, axis=0, kind=None):
@@ -503,7 +561,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
if level is not None:
raise NotImplementedError("'level' argument is not supported")
- this, other = self.align(other, join='outer', level=level, copy=False)
+ this, other = self.align(other, join="outer", level=level, copy=False)
new_index, new_columns = this.index, this.columns
if self.empty and other.empty:
@@ -527,10 +585,12 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
new_fill_value = self._get_op_result_fill_value(other, func)
- return self._constructor(data=new_data, index=new_index,
- columns=new_columns,
- default_fill_value=new_fill_value
- ).__finalize__(self)
+ return self._constructor(
+ data=new_data,
+ index=new_index,
+ columns=new_columns,
+ default_fill_value=new_fill_value,
+ ).__finalize__(self)
def _combine_match_index(self, other, func, level=None):
new_data = {}
@@ -538,8 +598,7 @@ def _combine_match_index(self, other, func, level=None):
if level is not None:
raise NotImplementedError("'level' argument is not supported")
- this, other = self.align(other, join='outer', axis=0, level=level,
- copy=False)
+ this, other = self.align(other, join="outer", axis=0, level=level, copy=False)
for col, series in this.items():
new_data[col] = func(series.values, other.values)
@@ -547,8 +606,11 @@ def _combine_match_index(self, other, func, level=None):
fill_value = self._get_op_result_fill_value(other, func)
return self._constructor(
- new_data, index=this.index, columns=self.columns,
- default_fill_value=fill_value).__finalize__(self)
+ new_data,
+ index=this.index,
+ columns=self.columns,
+ default_fill_value=fill_value,
+ ).__finalize__(self)
def _combine_match_columns(self, other, func, level=None):
# patched version of DataFrame._combine_match_columns to account for
@@ -559,8 +621,7 @@ def _combine_match_columns(self, other, func, level=None):
if level is not None:
raise NotImplementedError("'level' argument is not supported")
- left, right = self.align(other, join='outer', axis=1, level=level,
- copy=False)
+ left, right = self.align(other, join="outer", axis=1, level=level, copy=False)
assert left.columns.equals(right.index)
new_data = {}
@@ -569,8 +630,11 @@ def _combine_match_columns(self, other, func, level=None):
new_data[col] = func(left[col], float(right[col]))
return self._constructor(
- new_data, index=left.index, columns=left.columns,
- default_fill_value=self.default_fill_value).__finalize__(self)
+ new_data,
+ index=left.index,
+ columns=left.columns,
+ default_fill_value=self.default_fill_value,
+ ).__finalize__(self)
def _combine_const(self, other, func):
return self._apply_columns(lambda x: func(x, other))
@@ -581,7 +645,7 @@ def _get_op_result_fill_value(self, other, func):
if isinstance(other, DataFrame):
# i.e. called from _combine_frame
- other_default = getattr(other, 'default_fill_value', np.nan)
+ other_default = getattr(other, "default_fill_value", np.nan)
# if the fill values are the same use them? or use a valid one
if own_default == other_default:
@@ -601,18 +665,18 @@ def _get_op_result_fill_value(self, other, func):
if isna(other.fill_value) or isna(own_default):
fill_value = np.nan
else:
- fill_value = func(np.float64(own_default),
- np.float64(other.fill_value))
+ fill_value = func(np.float64(own_default), np.float64(other.fill_value))
else:
raise NotImplementedError(type(other))
return fill_value
- def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
- limit=None, takeable=False):
+ def _reindex_index(
+ self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False
+ ):
if level is not None:
- raise TypeError('Reindex by level not supported for sparse')
+ raise TypeError("Reindex by level not supported for sparse")
if self.index.equals(index):
if copy:
@@ -621,8 +685,9 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
return self
if len(self.index) == 0:
- return self._constructor(
- index=index, columns=self.columns).__finalize__(self)
+ return self._constructor(index=index, columns=self.columns).__finalize__(
+ self
+ )
indexer = self.index.get_indexer(index, method, limit=limit)
indexer = ensure_platform_int(indexer)
@@ -647,13 +712,17 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
new_series[col] = new
return self._constructor(
- new_series, index=index, columns=self.columns,
- default_fill_value=self._default_fill_value).__finalize__(self)
+ new_series,
+ index=index,
+ columns=self.columns,
+ default_fill_value=self._default_fill_value,
+ ).__finalize__(self)
- def _reindex_columns(self, columns, method, copy, level, fill_value=None,
- limit=None, takeable=False):
+ def _reindex_columns(
+ self, columns, method, copy, level, fill_value=None, limit=None, takeable=False
+ ):
if level is not None:
- raise TypeError('Reindex by level not supported for sparse')
+ raise TypeError("Reindex by level not supported for sparse")
if notna(fill_value):
raise NotImplementedError("'fill_value' argument is not supported")
@@ -667,21 +736,31 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None,
# TODO: fill value handling
sdict = {k: v for k, v in self.items() if k in columns}
return self._constructor(
- sdict, index=self.index, columns=columns,
- default_fill_value=self._default_fill_value).__finalize__(self)
-
- def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
- limit=None, copy=False, allow_dups=False):
+ sdict,
+ index=self.index,
+ columns=columns,
+ default_fill_value=self._default_fill_value,
+ ).__finalize__(self)
+
+ def _reindex_with_indexers(
+ self,
+ reindexers,
+ method=None,
+ fill_value=None,
+ limit=None,
+ copy=False,
+ allow_dups=False,
+ ):
if method is not None or limit is not None:
- raise NotImplementedError("cannot reindex with a method or limit "
- "with sparse")
+ raise NotImplementedError(
+ "cannot reindex with a method or limit " "with sparse"
+ )
if fill_value is None:
fill_value = np.nan
- reindexers = {self._get_axis_number(a): val
- for (a, val) in reindexers.items()}
+ reindexers = {self._get_axis_number(a): val for (a, val) in reindexers.items()}
index, row_indexer = reindexers.get(0, (None, None))
columns, col_indexer = reindexers.get(1, (None, None))
@@ -695,30 +774,32 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None,
continue
if row_indexer is not None:
new_arrays[col] = algos.take_1d(
- self[col]._internal_get_values(),
- row_indexer,
- fill_value=fill_value)
+ self[col]._internal_get_values(), row_indexer, fill_value=fill_value
+ )
else:
new_arrays[col] = self[col]
- return self._constructor(new_arrays, index=index,
- columns=columns).__finalize__(self)
+ return self._constructor(new_arrays, index=index, columns=columns).__finalize__(
+ self
+ )
- def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
- sort=False):
+ def _join_compat(
+ self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
+ ):
if on is not None:
- raise NotImplementedError("'on' keyword parameter is not yet "
- "implemented")
+ raise NotImplementedError(
+ "'on' keyword parameter is not yet " "implemented"
+ )
return self._join_index(other, how, lsuffix, rsuffix)
def _join_index(self, other, how, lsuffix, rsuffix):
if isinstance(other, Series):
if other.name is None:
- raise ValueError('Other Series must have a name')
+ raise ValueError("Other Series must have a name")
other = SparseDataFrame(
- {other.name: other},
- default_fill_value=self._default_fill_value)
+ {other.name: other}, default_fill_value=self._default_fill_value
+ )
join_index = self.index.join(other.index, how=how)
@@ -728,23 +809,26 @@ def _join_index(self, other, how, lsuffix, rsuffix):
this, other = this._maybe_rename_join(other, lsuffix, rsuffix)
from pandas import concat
+
return concat([this, other], axis=1, verify_integrity=True)
def _maybe_rename_join(self, other, lsuffix, rsuffix):
to_rename = self.columns.intersection(other.columns)
if len(to_rename) > 0:
if not lsuffix and not rsuffix:
- raise ValueError('columns overlap but no suffix specified: '
- '{to_rename}'.format(to_rename=to_rename))
+ raise ValueError(
+ "columns overlap but no suffix specified: "
+ "{to_rename}".format(to_rename=to_rename)
+ )
def lrenamer(x):
if x in to_rename:
- return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
+ return "{x}{lsuffix}".format(x=x, lsuffix=lsuffix)
return x
def rrenamer(x):
if x in to_rename:
- return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
+ return "{x}{rsuffix}".format(x=x, rsuffix=rsuffix)
return x
this = self.rename(columns=lrenamer)
@@ -760,9 +844,12 @@ def transpose(self, *args, **kwargs):
"""
nv.validate_transpose(args, kwargs)
return self._constructor(
- self.values.T, index=self.columns, columns=self.index,
+ self.values.T,
+ index=self.columns,
+ columns=self.index,
default_fill_value=self._default_fill_value,
- default_kind=self._default_kind).__finalize__(self)
+ default_kind=self._default_kind,
+ ).__finalize__(self)
T = property(transpose)
@@ -793,18 +880,19 @@ def cumsum(self, axis=0, *args, **kwargs):
return self.apply(lambda x: x.cumsum(), axis=axis)
- @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
def isna(self):
return self._apply_columns(lambda x: x.isna())
+
isnull = isna
- @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs)
def notna(self):
return self._apply_columns(lambda x: x.notna())
+
notnull = notna
- def apply(self, func, axis=0, broadcast=None, reduce=None,
- result_type=None):
+ def apply(self, func, axis=0, broadcast=None, reduce=None, result_type=None):
"""
Analogous to DataFrame.apply, for SparseDataFrame
@@ -865,17 +953,23 @@ def apply(self, func, axis=0, broadcast=None, reduce=None,
applied.fill_value = func(v.fill_value)
new_series[k] = applied
return self._constructor(
- new_series, index=self.index, columns=self.columns,
+ new_series,
+ index=self.index,
+ columns=self.columns,
default_fill_value=self._default_fill_value,
- default_kind=self._default_kind).__finalize__(self)
+ default_kind=self._default_kind,
+ ).__finalize__(self)
from pandas.core.apply import frame_apply
- op = frame_apply(self,
- func=func,
- axis=axis,
- reduce=reduce,
- broadcast=broadcast,
- result_type=result_type)
+
+ op = frame_apply(
+ self,
+ func=func,
+ axis=axis,
+ reduce=reduce,
+ broadcast=broadcast,
+ result_type=result_type,
+ )
return op.get_result()
def applymap(self, func):
@@ -904,8 +998,7 @@ def to_manager(sdf, columns, index):
# from BlockManager perspective
axes = [ensure_index(columns), ensure_index(index)]
- return create_block_manager_from_arrays(
- [sdf[c] for c in columns], columns, axes)
+ return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes)
def stack_sparse_frame(frame):
@@ -925,7 +1018,7 @@ def stack_sparse_frame(frame):
# SparseDataFrame with a non-np.NaN fill value (fails earlier).
for _, series in frame.items():
if not np.isnan(series.fill_value):
- raise TypeError('This routine assumes NaN fill value')
+ raise TypeError("This routine assumes NaN fill value")
int_index = series.sp_index.to_int_index()
inds_to_concat.append(int_index.indices)
@@ -933,12 +1026,13 @@ def stack_sparse_frame(frame):
major_codes = np.concatenate(inds_to_concat)
stacked_values = np.concatenate(vals_to_concat)
- index = MultiIndex(levels=[frame.index, frame.columns],
- codes=[major_codes, minor_codes],
- verify_integrity=False)
+ index = MultiIndex(
+ levels=[frame.index, frame.columns],
+ codes=[major_codes, minor_codes],
+ verify_integrity=False,
+ )
- lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
- columns=['foo'])
+ lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, columns=["foo"])
return lp.sort_index(level=0)
@@ -966,7 +1060,7 @@ def homogenize(series_dict):
for _, series in series_dict.items():
if not np.isnan(series.fill_value):
- raise TypeError('this method is only valid with NaN fill values')
+ raise TypeError("this method is only valid with NaN fill values")
if index is None:
index = series.sp_index
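The SparseDataFrame changes above illustrate how black handles signatures that exceed the line limit: each parameter moves to its own line and gains a trailing comma. A hedged sketch with a hypothetical function (not pandas code); the default_kind="block" default mirrors the hunk above:

def make_sparse_like(
    data=None,
    index=None,
    columns=None,
    default_kind="block",
    default_fill_value=float("nan"),
    copy=False,
):
    # Return the arguments so the sketch is runnable end to end.
    return (data, index, columns, default_kind, default_fill_value, copy)


print(make_sparse_like(columns=["a", "b"]))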
diff --git a/pandas/core/sparse/scipy_sparse.py b/pandas/core/sparse/scipy_sparse.py
index 7ff0f46575661..73638f5965119 100644
--- a/pandas/core/sparse/scipy_sparse.py
+++ b/pandas/core/sparse/scipy_sparse.py
@@ -13,13 +13,12 @@ def _check_is_partition(parts, whole):
whole = set(whole)
parts = [set(x) for x in parts]
if set.intersection(*parts) != set():
- raise ValueError(
- 'Is not a partition because intersection is not null.')
+ raise ValueError("Is not a partition because intersection is not null.")
if set.union(*parts) != whole:
- raise ValueError('Is not a partition because union is not the whole.')
+ raise ValueError("Is not a partition because union is not the whole.")
-def _to_ijv(ss, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
""" For arbitrary (MultiIndexed) SparseSeries return
(v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
passing to scipy.sparse.coo constructor. """
@@ -36,8 +35,7 @@ def get_indexers(levels):
# TODO: how to do this better? cleanly slice nonnull_labels given the
# coord
- values_ilabels = [tuple(x[i] for i in levels)
- for x in nonnull_labels.index]
+ values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index]
if len(levels) == 1:
values_ilabels = [x[0] for x in values_ilabels]
@@ -55,12 +53,11 @@ def _get_label_to_i_dict(labels, sort_labels=False):
if sort_labels:
labels = sorted(list(labels))
d = OrderedDict((k, i) for i, k in enumerate(labels))
- return (d)
+ return d
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
- labels_to_i = _get_label_to_i_dict(ilabels,
- sort_labels=sort_labels)
+ labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels)
labels_to_i = Series(labels_to_i)
if len(subset) > 1:
labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
@@ -69,11 +66,12 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
labels_to_i.index = Index(x[0] for x in labels_to_i.index)
labels_to_i.index.name = index.names[subset[0]]
- labels_to_i.name = 'value'
- return (labels_to_i)
+ labels_to_i.name = "value"
+ return labels_to_i
- labels_to_i = _get_index_subset_to_coord_dict(ss.index, levels,
- sort_labels=sort_labels)
+ labels_to_i = _get_index_subset_to_coord_dict(
+ ss.index, levels, sort_labels=sort_labels
+ )
# #####################################################################
# #####################################################################
@@ -88,8 +86,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
return values, i_coord, j_coord, i_labels, j_labels
-def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),
- sort_labels=False):
+def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
"""
Convert a SparseSeries to a scipy.sparse.coo_matrix using index
levels row_levels, column_levels as the row and column
@@ -99,25 +96,26 @@ def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),
import scipy.sparse
if ss.index.nlevels < 2:
- raise ValueError('to_coo requires MultiIndex with nlevels > 2')
+ raise ValueError("to_coo requires MultiIndex with nlevels > 2")
if not ss.index.is_unique:
- raise ValueError('Duplicate index entries are not allowed in to_coo '
- 'transformation.')
+ raise ValueError(
+ "Duplicate index entries are not allowed in to_coo " "transformation."
+ )
# to keep things simple, only rely on integer indexing (not labels)
row_levels = [ss.index._get_level_number(x) for x in row_levels]
column_levels = [ss.index._get_level_number(x) for x in column_levels]
- v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels,
- column_levels=column_levels,
- sort_labels=sort_labels)
+ v, i, j, rows, columns = _to_ijv(
+ ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
+ )
sparse_matrix = scipy.sparse.coo_matrix(
- (v, (i, j)), shape=(len(rows), len(columns)))
+ (v, (i, j)), shape=(len(rows), len(columns))
+ )
return sparse_matrix, rows, columns
-def _coo_to_sparse_series(A, dense_index: bool = False,
- sparse_series: bool = True):
+def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = True):
"""
Convert a scipy.sparse.coo_matrix to a SparseSeries.
@@ -141,8 +139,7 @@ def _coo_to_sparse_series(A, dense_index: bool = False,
try:
s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
except AttributeError:
- raise TypeError('Expected coo_matrix. Got {} instead.'
- .format(type(A).__name__))
+ raise TypeError("Expected coo_matrix. Got {} instead.".format(type(A).__name__))
s = s.sort_index()
if sparse_series:
# TODO(SparseSeries): remove this and the sparse_series keyword.
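One small normalisation visible in scipy_sparse.py above: black tightens one-element tuple defaults, so (0, ) becomes (0,). A tiny sketch with a hypothetical helper:

def coo_levels(row_levels=(0,), column_levels=(1,)):
    # Single-element tuples keep the trailing comma but lose the space.
    return row_levels + column_levels


print(coo_levels())  # (0, 1)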
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
index 88b6634db92b6..43f2609f46bd6 100644
--- a/pandas/core/sparse/series.py
+++ b/pandas/core/sparse/series.py
@@ -24,12 +24,15 @@
from pandas.core.internals import SingleBlockManager
import pandas.core.ops as ops
from pandas.core.series import Series
-from pandas.core.sparse.scipy_sparse import (
- _coo_to_sparse_series, _sparse_series_to_coo)
+from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series, _sparse_series_to_coo
-_shared_doc_kwargs = dict(axes='index', klass='SparseSeries',
- axes_single_arg="{0, 'index'}",
- optional_labels='', optional_axis='')
+_shared_doc_kwargs = dict(
+ axes="index",
+ klass="SparseSeries",
+ axes_single_arg="{0, 'index'}",
+ optional_labels="",
+ optional_axis="",
+)
depr_msg = """\
@@ -70,11 +73,21 @@ class SparseSeries(Series):
must change values, convert to dense, make your changes, then convert back
to sparse
"""
- _subtyp = 'sparse_series'
- def __init__(self, data=None, index=None, sparse_index=None, kind='block',
- fill_value=None, name=None, dtype=None, copy=False,
- fastpath=False):
+ _subtyp = "sparse_series"
+
+ def __init__(
+ self,
+ data=None,
+ index=None,
+ sparse_index=None,
+ kind="block",
+ fill_value=None,
+ name=None,
+ dtype=None,
+ copy=False,
+ fastpath=False,
+ ):
warnings.warn(depr_msg, FutureWarning, stacklevel=2)
# TODO: Most of this should be refactored and shared with Series
# 1. BlockManager -> array
@@ -102,55 +115,67 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
data = np.full(len(index), fill_value=data)
super().__init__(
- SparseArray(data,
- sparse_index=sparse_index,
- kind=kind,
- dtype=dtype,
- fill_value=fill_value,
- copy=copy),
- index=index, name=name,
- copy=False, fastpath=fastpath
+ SparseArray(
+ data,
+ sparse_index=sparse_index,
+ kind=kind,
+ dtype=dtype,
+ fill_value=fill_value,
+ copy=copy,
+ ),
+ index=index,
+ name=name,
+ copy=False,
+ fastpath=fastpath,
)
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
# avoid infinite recursion for other SparseSeries inputs
- inputs = tuple(
- x.values if isinstance(x, type(self)) else x
- for x in inputs
- )
+ inputs = tuple(x.values if isinstance(x, type(self)) else x for x in inputs)
result = self.values.__array_ufunc__(ufunc, method, *inputs, **kwargs)
- return self._constructor(result, index=self.index,
- sparse_index=self.sp_index,
- fill_value=result.fill_value,
- copy=False).__finalize__(self)
+ return self._constructor(
+ result,
+ index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False,
+ ).__finalize__(self)
# unary ops
# TODO: See if this can be shared
def __pos__(self):
result = self.values.__pos__()
- return self._constructor(result, index=self.index,
- sparse_index=self.sp_index,
- fill_value=result.fill_value,
- copy=False).__finalize__(self)
+ return self._constructor(
+ result,
+ index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False,
+ ).__finalize__(self)
def __neg__(self):
result = self.values.__neg__()
- return self._constructor(result, index=self.index,
- sparse_index=self.sp_index,
- fill_value=result.fill_value,
- copy=False).__finalize__(self)
+ return self._constructor(
+ result,
+ index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False,
+ ).__finalize__(self)
def __invert__(self):
result = self.values.__invert__()
- return self._constructor(result, index=self.index,
- sparse_index=self.sp_index,
- fill_value=result.fill_value,
- copy=False).__finalize__(self)
+ return self._constructor(
+ result,
+ index=self.index,
+ sparse_index=self.sp_index,
+ fill_value=result.fill_value,
+ copy=False,
+ ).__finalize__(self)
@property
def block(self):
- warnings.warn("SparseSeries.block is deprecated.", FutureWarning,
- stacklevel=2)
+ warnings.warn("SparseSeries.block is deprecated.", FutureWarning, stacklevel=2)
return self._data._block
@property
@@ -174,18 +199,29 @@ def npoints(self):
return self.values.npoints
@classmethod
- def from_array(cls, arr, index=None, name=None, copy=False,
- fill_value=None, fastpath=False):
+ def from_array(
+ cls, arr, index=None, name=None, copy=False, fill_value=None, fastpath=False
+ ):
"""Construct SparseSeries from array.
.. deprecated:: 0.23.0
Use the pd.SparseSeries(..) constructor instead.
"""
- warnings.warn("'from_array' is deprecated and will be removed in a "
- "future version. Please use the pd.SparseSeries(..) "
- "constructor instead.", FutureWarning, stacklevel=2)
- return cls(arr, index=index, name=name, copy=copy,
- fill_value=fill_value, fastpath=fastpath)
+ warnings.warn(
+ "'from_array' is deprecated and will be removed in a "
+ "future version. Please use the pd.SparseSeries(..) "
+ "constructor instead.",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return cls(
+ arr,
+ index=index,
+ name=name,
+ copy=copy,
+ fill_value=fill_value,
+ fastpath=fastpath,
+ )
@property
def _constructor(self):
@@ -194,14 +230,15 @@ def _constructor(self):
@property
def _constructor_expanddim(self):
from pandas.core.sparse.api import SparseDataFrame
+
return SparseDataFrame
@property
def kind(self):
if isinstance(self.sp_index, BlockIndex):
- return 'block'
+ return "block"
elif isinstance(self.sp_index, IntIndex):
- return 'integer'
+ return "integer"
def as_sparse_array(self, kind=None, fill_value=None, copy=False):
""" return my self as a sparse array, do not copy by default """
@@ -210,26 +247,36 @@ def as_sparse_array(self, kind=None, fill_value=None, copy=False):
fill_value = self.fill_value
if kind is None:
kind = self.kind
- return SparseArray(self.values, sparse_index=self.sp_index,
- fill_value=fill_value, kind=kind, copy=copy)
+ return SparseArray(
+ self.values,
+ sparse_index=self.sp_index,
+ fill_value=fill_value,
+ kind=kind,
+ copy=copy,
+ )
def __repr__(self):
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Sparse")
series_rep = Series.__repr__(self)
- rep = '{series}\n{index!r}'.format(series=series_rep,
- index=self.sp_index)
+ rep = "{series}\n{index!r}".format(series=series_rep, index=self.sp_index)
return rep
- def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
- filter_type=None, **kwds):
+ def _reduce(
+ self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
+ ):
""" perform a reduction operation """
return op(self.array.to_dense(), skipna=skipna, **kwds)
def __getstate__(self):
# pickling
- return dict(_typ=self._typ, _subtyp=self._subtyp, _data=self._data,
- fill_value=self.fill_value, name=self.name)
+ return dict(
+ _typ=self._typ,
+ _subtyp=self._subtyp,
+ _data=self._data,
+ fill_value=self.fill_value,
+ name=self.name,
+ )
def _unpickle_series_compat(self, state):
@@ -246,8 +293,9 @@ def _unpickle_series_compat(self, state):
# create a sparse array
if not isinstance(data, SparseArray):
- data = SparseArray(data, sparse_index=sp_index,
- fill_value=fill_value, copy=False)
+ data = SparseArray(
+ data, sparse_index=sp_index, fill_value=fill_value, copy=False
+ )
# recreate
data = SingleBlockManager(data, index, fastpath=True)
@@ -258,9 +306,9 @@ def _unpickle_series_compat(self, state):
def _set_subtyp(self, is_all_dates):
if is_all_dates:
- object.__setattr__(self, '_subtyp', 'sparse_time_series')
+ object.__setattr__(self, "_subtyp", "sparse_time_series")
else:
- object.__setattr__(self, '_subtyp', 'sparse_series')
+ object.__setattr__(self, "_subtyp", "sparse_series")
def _ixs(self, i, axis=0):
"""
@@ -294,8 +342,9 @@ def __getitem__(self, key):
def _get_values(self, indexer):
try:
- return self._constructor(self._data.get_slice(indexer),
- fastpath=True).__finalize__(self)
+ return self._constructor(
+ self._data.get_slice(indexer), fastpath=True
+ ).__finalize__(self)
except Exception:
return self[indexer]
@@ -311,8 +360,9 @@ def abs(self):
-------
abs: same type as caller
"""
- return self._constructor(np.abs(self.values),
- index=self.index).__finalize__(self)
+ return self._constructor(np.abs(self.values), index=self.index).__finalize__(
+ self
+ )
def get(self, label, default=None):
"""
@@ -353,16 +403,20 @@ def get_value(self, label, takeable=False):
-------
value : scalar value
"""
- warnings.warn("get_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "get_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._get_value(label, takeable=takeable)
def _get_value(self, label, takeable=False):
loc = label if takeable is True else self.index.get_loc(label)
return self._get_val_at(loc)
+
_get_value.__doc__ = get_value.__doc__
def set_value(self, label, value, takeable=False):
@@ -392,10 +446,13 @@ def set_value(self, label, value, takeable=False):
-------
series : SparseSeries
"""
- warnings.warn("set_value is deprecated and will be removed "
- "in a future release. Please use "
- ".at[] or .iat[] accessors instead", FutureWarning,
- stacklevel=2)
+ warnings.warn(
+ "set_value is deprecated and will be removed "
+ "in a future release. Please use "
+ ".at[] or .iat[] accessors instead",
+ FutureWarning,
+ stacklevel=2,
+ )
return self._set_value(label, value, takeable=takeable)
def _set_value(self, label, value, takeable=False):
@@ -407,10 +464,10 @@ def _set_value(self, label, value, takeable=False):
if new_values is not None:
values = new_values
new_index = values.index
- values = SparseArray(values, fill_value=self.fill_value,
- kind=self.kind)
+ values = SparseArray(values, fill_value=self.fill_value, kind=self.kind)
self._data = SingleBlockManager(values, new_index)
self._index = new_index
+
_set_value.__doc__ = set_value.__doc__
def _set_values(self, key, value):
@@ -424,8 +481,7 @@ def _set_values(self, key, value):
values = self.values.to_dense()
values[key] = libindex.convert_scalar(values, value)
- values = SparseArray(values, fill_value=self.fill_value,
- kind=self.kind)
+ values = SparseArray(values, fill_value=self.fill_value, kind=self.kind)
self._data = SingleBlockManager(values, self.index)
def to_dense(self):
@@ -436,8 +492,7 @@ def to_dense(self):
-------
s : Series
"""
- return Series(self.values.to_dense(), index=self.index,
- name=self.name)
+ return Series(self.values.to_dense(), index=self.index, name=self.name)
@property
def density(self):
@@ -453,18 +508,21 @@ def copy(self, deep=True):
new_data = self.values
if deep:
new_data = new_data.copy()
- return self._constructor(new_data, sparse_index=self.sp_index,
- fill_value=self.fill_value,
- index=self.index.copy(),
- name=self.name).__finalize__(self)
+ return self._constructor(
+ new_data,
+ sparse_index=self.sp_index,
+ fill_value=self.fill_value,
+ index=self.index.copy(),
+ name=self.name,
+ ).__finalize__(self)
@Substitution(**_shared_doc_kwargs)
@Appender(generic.NDFrame.reindex.__doc__)
- def reindex(self, index=None, method=None, copy=True, limit=None,
- **kwargs):
+ def reindex(self, index=None, method=None, copy=True, limit=None, **kwargs):
# TODO: remove?
- return super().reindex(index=index, method=method, copy=copy,
- limit=limit, **kwargs)
+ return super().reindex(
+ index=index, method=method, copy=copy, limit=limit, **kwargs
+ )
def sparse_reindex(self, new_index):
"""
@@ -482,10 +540,11 @@ def sparse_reindex(self, new_index):
raise TypeError("new index must be a SparseIndex")
values = self.values
values = values.sp_index.to_int_index().reindex(
- values.sp_values.astype('float64'), values.fill_value, new_index)
- values = SparseArray(values,
- sparse_index=new_index,
- fill_value=self.values.fill_value)
+ values.sp_values.astype("float64"), values.fill_value, new_index
+ )
+ values = SparseArray(
+ values, sparse_index=new_index, fill_value=self.values.fill_value
+ )
return self._constructor(values, index=self.index).__finalize__(self)
def cumsum(self, axis=0, *args, **kwargs):
@@ -512,25 +571,30 @@ def cumsum(self, axis=0, *args, **kwargs):
new_array = self.values.cumsum()
return self._constructor(
- new_array, index=self.index,
- sparse_index=new_array.sp_index).__finalize__(self)
+ new_array, index=self.index, sparse_index=new_array.sp_index
+ ).__finalize__(self)
# TODO: SparseSeries.isna is Sparse, while Series.isna is dense
- @Appender(generic._shared_docs['isna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs)
def isna(self):
- arr = SparseArray(isna(self.values.sp_values),
- sparse_index=self.values.sp_index,
- fill_value=isna(self.fill_value))
+ arr = SparseArray(
+ isna(self.values.sp_values),
+ sparse_index=self.values.sp_index,
+ fill_value=isna(self.fill_value),
+ )
return self._constructor(arr, index=self.index).__finalize__(self)
isnull = isna
- @Appender(generic._shared_docs['notna'] % _shared_doc_kwargs)
+ @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs)
def notna(self):
- arr = SparseArray(notna(self.values.sp_values),
- sparse_index=self.values.sp_index,
- fill_value=notna(self.fill_value))
+ arr = SparseArray(
+ notna(self.values.sp_values),
+ sparse_index=self.values.sp_index,
+ fill_value=notna(self.fill_value),
+ )
return self._constructor(arr, index=self.index).__finalize__(self)
+
notnull = notna
def dropna(self, axis=0, inplace=False, **kwargs):
@@ -542,8 +606,9 @@ def dropna(self, axis=0, inplace=False, **kwargs):
self._get_axis_number(axis or 0)
dense_valid = self.to_dense().dropna()
if inplace:
- raise NotImplementedError("Cannot perform inplace dropna"
- " operations on a SparseSeries")
+ raise NotImplementedError(
+ "Cannot perform inplace dropna" " operations on a SparseSeries"
+ )
if isna(self.fill_value):
return dense_valid
else:
@@ -570,10 +635,10 @@ def combine_first(self, other):
return dense_combined.to_sparse(fill_value=self.fill_value)
@Appender(SparseAccessor.to_coo.__doc__)
- def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
- A, rows, columns = _sparse_series_to_coo(self, row_levels,
- column_levels,
- sort_labels=sort_labels)
+ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
+ A, rows, columns = _sparse_series_to_coo(
+ self, row_levels, column_levels, sort_labels=sort_labels
+ )
return A, rows, columns
@classmethod
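Several hunks in this file add nothing but a "+" blank line: black separates a def from the statement that follows it at the same indentation level. A minimal sketch of that rule, not taken from pandas:

def apply_columns(values):
    def doubler(x):  # hypothetical inner helper
        return x * 2

    return [doubler(v) for v in values]


print(apply_columns([1, 2, 3]))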
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 710b29c6a6536..70700653c4795 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -12,8 +12,15 @@
from pandas.util._decorators import Appender, deprecate_kwarg
from pandas.core.dtypes.common import (
- ensure_object, is_bool_dtype, is_categorical_dtype, is_integer,
- is_list_like, is_re, is_scalar, is_string_like)
+ ensure_object,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_integer,
+ is_list_like,
+ is_re,
+ is_scalar,
+ is_string_like,
+)
from pandas.core.dtypes.generic import ABCIndexClass, ABCMultiIndex, ABCSeries
from pandas.core.dtypes.missing import isna
@@ -22,11 +29,15 @@
import pandas.core.common as com
_cpython_optimized_encoders = (
- "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
-)
-_cpython_optimized_decoders = _cpython_optimized_encoders + (
- "utf-16", "utf-32"
+ "utf-8",
+ "utf8",
+ "latin-1",
+ "latin1",
+ "iso-8859-1",
+ "mbcs",
+ "ascii",
)
+_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
_shared_docs = dict() # type: Dict[str, str]
@@ -80,11 +91,12 @@ def cat_safe(list_of_columns: List, sep: str):
# object dtype), np.sum will fail; catch and return with better message
for column in list_of_columns:
dtype = lib.infer_dtype(column, skipna=True)
- if dtype not in ['string', 'empty']:
+ if dtype not in ["string", "empty"]:
raise TypeError(
- 'Concatenation requires list-likes containing only '
- 'strings (or missing values). Offending values found in '
- 'column {}'.format(dtype)) from None
+ "Concatenation requires list-likes containing only "
+ "strings (or missing values). Offending values found in "
+ "column {}".format(dtype)
+ ) from None
return result
@@ -109,8 +121,10 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
except (TypeError, AttributeError) as e:
# Reraise the exception if callable `f` got wrong number of args.
# The user may want to be warned by this, instead of getting NaN
- p_err = (r'((takes)|(missing)) (?(2)from \d+ to )?\d+ '
- r'(?(3)required )positional arguments?')
+ p_err = (
+ r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
+ r"(?(3)required )positional arguments?"
+ )
if len(e.args) >= 1 and re.search(p_err, e.args[0]):
raise e
@@ -330,9 +344,12 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
regex = re.compile(pat, flags=flags)
if regex.groups > 0:
- warnings.warn("This pattern has match groups. To actually get the"
- " groups, use str.extract.", UserWarning,
- stacklevel=3)
+ warnings.warn(
+ "This pattern has match groups. To actually get the"
+ " groups, use str.extract.",
+ UserWarning,
+ stacklevel=3,
+ )
f = lambda x: bool(regex.search(x))
else:
@@ -585,8 +602,9 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
if regex:
if is_compiled_re:
if (case is not None) or (flags != 0):
- raise ValueError("case and flags cannot be set"
- " when pat is a compiled regex")
+ raise ValueError(
+ "case and flags cannot be set when pat is a compiled regex"
+ )
else:
# not a compiled regex
# set default case
@@ -604,11 +622,11 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
f = lambda x: x.replace(pat, repl, n)
else:
if is_compiled_re:
- raise ValueError("Cannot use a compiled regex as replacement "
- "pattern with regex=False")
+ raise ValueError(
+ "Cannot use a compiled regex as replacement pattern with regex=False"
+ )
if callable(repl):
- raise ValueError("Cannot use a callable replacement when "
- "regex=False")
+ raise ValueError("Cannot use a callable replacement when regex=False")
f = lambda x: x.replace(pat, repl, n)
return _na_map(f, arr)
@@ -655,6 +673,7 @@ def str_repeat(arr, repeats):
dtype: object
"""
if is_scalar(repeats):
+
def scalar_rep(x):
try:
return bytes.__mul__(x, repeats)
@@ -732,6 +751,7 @@ def f(x):
return [np.nan if item is None else item for item in m.groups()]
else:
return empty_row
+
return f
@@ -764,7 +784,8 @@ def _str_extract_noexpand(arr, pat, flags=0):
[groups_or_na(val) for val in arr],
columns=columns,
index=arr.index,
- dtype=object)
+ dtype=object,
+ )
return result, name
@@ -792,7 +813,8 @@ def _str_extract_frame(arr, pat, flags=0):
[groups_or_na(val) for val in arr],
columns=columns,
index=result_index,
- dtype=object)
+ dtype=object,
+ )
def str_extract(arr, pat, flags=0, expand=True):
@@ -980,27 +1002,25 @@ def str_extractall(arr, pat, flags=0):
if isinstance(subject, str):
if not is_mi:
- subject_key = (subject_key, )
+ subject_key = (subject_key,)
for match_i, match_tuple in enumerate(regex.findall(subject)):
if isinstance(match_tuple, str):
match_tuple = (match_tuple,)
- na_tuple = [np.NaN if group == "" else group
- for group in match_tuple]
+ na_tuple = [np.NaN if group == "" else group for group in match_tuple]
match_list.append(na_tuple)
- result_key = tuple(subject_key + (match_i, ))
+ result_key = tuple(subject_key + (match_i,))
index_list.append(result_key)
from pandas import MultiIndex
- index = MultiIndex.from_tuples(
- index_list, names=arr.index.names + ["match"])
- result = arr._constructor_expanddim(match_list, index=index,
- columns=columns)
+ index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
+
+ result = arr._constructor_expanddim(match_list, index=index, columns=columns)
return result
-def str_get_dummies(arr, sep='|'):
+def str_get_dummies(arr, sep="|"):
"""
Split each string in the Series by sep and return a DataFrame
of dummy/indicator variables.
@@ -1034,7 +1054,7 @@ def str_get_dummies(arr, sep='|'):
1 0 0 0
2 1 0 1
"""
- arr = arr.fillna('')
+ arr = arr.fillna("")
try:
arr = sep + arr + sep
except TypeError:
@@ -1212,7 +1232,7 @@ def str_findall(arr, pat, flags=0):
return _na_map(regex.findall, arr)
-def str_find(arr, sub, start=0, end=None, side='left'):
+def str_find(arr, sub, start=0, end=None, side="left"):
"""
Return indexes in each strings in the Series/Index where the
substring is fully contained between [start:end]. Return -1 on failure.
@@ -1235,15 +1255,15 @@ def str_find(arr, sub, start=0, end=None, side='left'):
"""
if not isinstance(sub, str):
- msg = 'expected a string object, not {0}'
+ msg = "expected a string object, not {0}"
raise TypeError(msg.format(type(sub).__name__))
- if side == 'left':
- method = 'find'
- elif side == 'right':
- method = 'rfind'
+ if side == "left":
+ method = "find"
+ elif side == "right":
+ method = "rfind"
else: # pragma: no cover
- raise ValueError('Invalid side')
+ raise ValueError("Invalid side")
if end is None:
f = lambda x: getattr(x, method)(sub, start)
@@ -1253,17 +1273,17 @@ def str_find(arr, sub, start=0, end=None, side='left'):
return _na_map(f, arr, dtype=int)
-def str_index(arr, sub, start=0, end=None, side='left'):
+def str_index(arr, sub, start=0, end=None, side="left"):
if not isinstance(sub, str):
- msg = 'expected a string object, not {0}'
+ msg = "expected a string object, not {0}"
raise TypeError(msg.format(type(sub).__name__))
- if side == 'left':
- method = 'index'
- elif side == 'right':
- method = 'rindex'
+ if side == "left":
+ method = "index"
+ elif side == "right":
+ method = "rindex"
else: # pragma: no cover
- raise ValueError('Invalid side')
+ raise ValueError("Invalid side")
if end is None:
f = lambda x: getattr(x, method)(sub, start)
@@ -1273,7 +1293,7 @@ def str_index(arr, sub, start=0, end=None, side='left'):
return _na_map(f, arr, dtype=int)
-def str_pad(arr, width, side='left', fillchar=' '):
+def str_pad(arr, width, side="left", fillchar=" "):
"""
Pad strings in the Series/Index up to width.
@@ -1327,24 +1347,24 @@ def str_pad(arr, width, side='left', fillchar=' '):
dtype: object
"""
if not isinstance(fillchar, str):
- msg = 'fillchar must be a character, not {0}'
+ msg = "fillchar must be a character, not {0}"
raise TypeError(msg.format(type(fillchar).__name__))
if len(fillchar) != 1:
- raise TypeError('fillchar must be a character, not str')
+ raise TypeError("fillchar must be a character, not str")
if not is_integer(width):
- msg = 'width must be of integer type, not {0}'
+ msg = "width must be of integer type, not {0}"
raise TypeError(msg.format(type(width).__name__))
- if side == 'left':
+ if side == "left":
f = lambda x: x.rjust(width, fillchar)
- elif side == 'right':
+ elif side == "right":
f = lambda x: x.ljust(width, fillchar)
- elif side == 'both':
+ elif side == "both":
f = lambda x: x.center(width, fillchar)
else: # pragma: no cover
- raise ValueError('Invalid side')
+ raise ValueError("Invalid side")
return _na_map(f, arr)
@@ -1522,14 +1542,14 @@ def str_slice_replace(arr, start=None, stop=None, repl=None):
dtype: object
"""
if repl is None:
- repl = ''
+ repl = ""
def f(x):
- if x[start:stop] == '':
+ if x[start:stop] == "":
local_stop = start
else:
local_stop = stop
- y = ''
+ y = ""
if start is not None:
y += x[:start]
y += repl
@@ -1540,7 +1560,7 @@ def f(x):
return _na_map(f, arr)
-def str_strip(arr, to_strip=None, side='both'):
+def str_strip(arr, to_strip=None, side="both"):
"""
Strip whitespace (including newlines) from each string in the
Series/Index.
@@ -1554,14 +1574,14 @@ def str_strip(arr, to_strip=None, side='both'):
-------
Series or Index
"""
- if side == 'both':
+ if side == "both":
f = lambda x: x.strip(to_strip)
- elif side == 'left':
+ elif side == "left":
f = lambda x: x.lstrip(to_strip)
- elif side == 'right':
+ elif side == "right":
f = lambda x: x.rstrip(to_strip)
else: # pragma: no cover
- raise ValueError('Invalid side')
+ raise ValueError("Invalid side")
return _na_map(f, arr)
@@ -1622,11 +1642,11 @@ def str_wrap(arr, width, **kwargs):
1 another line\nto be\nwrapped
dtype: object
"""
- kwargs['width'] = width
+ kwargs["width"] = width
tw = textwrap.TextWrapper(**kwargs)
- return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
+ return _na_map(lambda s: "\n".join(tw.wrap(s)), arr)
def str_translate(arr, table):
@@ -1700,12 +1720,14 @@ def str_get(arr, i):
5 None
dtype: object
"""
+
def f(x):
if isinstance(x, dict):
return x.get(i)
elif len(x) > i >= -len(x):
return x[i]
return np.nan
+
return _na_map(f, arr)
@@ -1801,8 +1823,9 @@ def forbid_nonstring_types(forbidden, name=None):
# deal with None
forbidden = [] if forbidden is None else forbidden
- allowed_types = {'string', 'empty', 'bytes',
- 'mixed', 'mixed-integer'} - set(forbidden)
+ allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
+ forbidden
+ )
def _forbid_nonstring_types(func):
func_name = func.__name__ if name is None else name
@@ -1810,18 +1833,22 @@ def _forbid_nonstring_types(func):
@wraps(func)
def wrapper(self, *args, **kwargs):
if self._inferred_dtype not in allowed_types:
- msg = ('Cannot use .str.{name} with values of inferred dtype '
- '{inf_type!r}.'.format(name=func_name,
- inf_type=self._inferred_dtype))
+ msg = (
+ "Cannot use .str.{name} with values of inferred dtype "
+ "{inf_type!r}.".format(
+ name=func_name, inf_type=self._inferred_dtype
+ )
+ )
raise TypeError(msg)
return func(self, *args, **kwargs)
+
wrapper.__name__ = func_name
return wrapper
+
return _forbid_nonstring_types
-def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=['bytes'],
- **kargs):
+def _noarg_wrapper(f, name=None, docstring=None, forbidden_types=["bytes"], **kargs):
@forbid_nonstring_types(forbidden_types, name=name)
def wrapper(self):
result = _na_map(f, self._parent, **kargs)
@@ -1831,13 +1858,14 @@ def wrapper(self):
if docstring is not None:
wrapper.__doc__ = docstring
else:
- raise ValueError('Provide docstring')
+ raise ValueError("Provide docstring")
return wrapper
-def _pat_wrapper(f, flags=False, na=False, name=None,
- forbidden_types=['bytes'], **kwargs):
+def _pat_wrapper(
+ f, flags=False, na=False, name=None, forbidden_types=["bytes"], **kwargs
+):
@forbid_nonstring_types(forbidden_types, name=name)
def wrapper1(self, pat):
result = f(self._parent, pat)
@@ -1919,21 +1947,21 @@ def _validate(data):
dtype : inferred dtype of data
"""
if isinstance(data, ABCMultiIndex):
- raise AttributeError('Can only use .str accessor with Index, '
- 'not MultiIndex')
+ raise AttributeError(
+ "Can only use .str accessor with Index, not MultiIndex"
+ )
# see _libs/lib.pyx for list of inferred types
- allowed_types = ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']
+ allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
- values = getattr(data, 'values', data) # Series / Index
- values = getattr(values, 'categories', values) # categorical / normal
+ values = getattr(data, "values", data) # Series / Index
+ values = getattr(values, "categories", values) # categorical / normal
# missing values obfuscate type inference -> skip
inferred_dtype = lib.infer_dtype(values, skipna=True)
if inferred_dtype not in allowed_types:
- raise AttributeError("Can only use .str accessor with string "
- "values!")
+ raise AttributeError("Can only use .str accessor with string values!")
return inferred_dtype
def __getitem__(self, key):
@@ -1950,8 +1978,9 @@ def __iter__(self):
i += 1
g = self.get(i)
- def _wrap_result(self, result, use_codes=True,
- name=None, expand=None, fill_value=np.nan):
+ def _wrap_result(
+ self, result, use_codes=True, name=None, expand=None, fill_value=np.nan
+ ):
from pandas import Index, Series, MultiIndex
@@ -1962,10 +1991,11 @@ def _wrap_result(self, result, use_codes=True,
# before the transformation...
if use_codes and self._is_categorical:
# if self._orig is a CategoricalIndex, there is no .cat-accessor
- result = take_1d(result, Series(self._orig, copy=False).cat.codes,
- fill_value=fill_value)
+ result = take_1d(
+ result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value
+ )
- if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'):
+ if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
return result
assert result.ndim < 3
@@ -1987,8 +2017,9 @@ def cons_row(x):
if result:
# propagate nan values to match longest sequence (GH 18450)
max_len = max(len(x) for x in result)
- result = [x * max_len if len(x) == 0 or x[0] is np.nan
- else x for x in result]
+ result = [
+ x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result
+ ]
if not isinstance(expand, bool):
raise ValueError("expand must be True or False")
@@ -1997,7 +2028,7 @@ def cons_row(x):
# if expand is False, result should have the same name
# as the original otherwise specified
if name is None:
- name = getattr(result, 'name', None)
+ name = getattr(result, "name", None)
if name is None:
# do not use logical or, _orig may be a DataFrame
# which has "name" column
@@ -2058,9 +2089,11 @@ def _get_series_list(self, others, ignore_index=False):
# self._orig is either Series or Index
idx = self._orig if isinstance(self._orig, Index) else self._orig.index
- err_msg = ('others must be Series, Index, DataFrame, np.ndarray or '
- 'list-like (either containing only strings or containing '
- 'only objects of type Series/Index/list-like/np.ndarray)')
+ err_msg = (
+ "others must be Series, Index, DataFrame, np.ndarray or "
+ "list-like (either containing only strings or containing "
+ "only objects of type Series/Index/list-like/np.ndarray)"
+ )
# Generally speaking, all objects without an index inherit the index
# `idx` of the calling Series/Index - i.e. must have matching length.
@@ -2069,13 +2102,13 @@ def _get_series_list(self, others, ignore_index=False):
if isinstance(others, Series):
warn = not others.index.equals(idx)
# only reconstruct Series when absolutely necessary
- los = [Series(others.values, index=idx)
- if ignore_index and warn else others]
+ los = [
+ Series(others.values, index=idx) if ignore_index and warn else others
+ ]
return (los, warn)
elif isinstance(others, Index):
warn = not others.equals(idx)
- los = [Series(others.values,
- index=(idx if ignore_index else others))]
+ los = [Series(others.values, index=(idx if ignore_index else others))]
return (los, warn)
elif isinstance(others, DataFrame):
warn = not others.index.equals(idx)
@@ -2105,45 +2138,51 @@ def _get_series_list(self, others, ignore_index=False):
# GH 21950 - DeprecationWarning
# only allowing Series/Index/np.ndarray[1-dim] will greatly
# simply this function post-deprecation.
- if not (isinstance(nxt, (Series, Index)) or
- (isinstance(nxt, np.ndarray) and nxt.ndim == 1)):
+ if not (
+ isinstance(nxt, (Series, Index))
+ or (isinstance(nxt, np.ndarray) and nxt.ndim == 1)
+ ):
depr_warn = True
- if not isinstance(nxt, (DataFrame, Series,
- Index, np.ndarray)):
+ if not isinstance(nxt, (DataFrame, Series, Index, np.ndarray)):
# safety for non-persistent list-likes (e.g. iterators)
# do not map indexed/typed objects; info needed below
nxt = list(nxt)
# known types for which we can avoid deep inspection
- no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1)
- or isinstance(nxt, (Series, Index)))
+ no_deep = (
+ isinstance(nxt, np.ndarray) and nxt.ndim == 1
+ ) or isinstance(nxt, (Series, Index))
# nested list-likes are forbidden:
# -> elements of nxt must not be list-like
- is_legal = ((no_deep and nxt.dtype == object)
- or all(not is_list_like(x) for x in nxt))
+ is_legal = (no_deep and nxt.dtype == object) or all(
+ not is_list_like(x) for x in nxt
+ )
# DataFrame is false positive of is_legal
# because "x in df" returns column names
if not is_legal or isinstance(nxt, DataFrame):
raise TypeError(err_msg)
- nxt, wnx = self._get_series_list(nxt,
- ignore_index=ignore_index)
+ nxt, wnx = self._get_series_list(nxt, ignore_index=ignore_index)
los = los + nxt
join_warn = join_warn or wnx
if depr_warn:
- warnings.warn('list-likes other than Series, Index, or '
- 'np.ndarray WITHIN another list-like are '
- 'deprecated and will be removed in a future '
- 'version.', FutureWarning, stacklevel=4)
+ warnings.warn(
+ "list-likes other than Series, Index, or "
+ "np.ndarray WITHIN another list-like are "
+ "deprecated and will be removed in a future "
+ "version.",
+ FutureWarning,
+ stacklevel=4,
+ )
return (los, join_warn)
elif all(not is_list_like(x) for x in others):
return ([Series(others, index=idx)], False)
raise TypeError(err_msg)
- @forbid_nonstring_types(['bytes', 'mixed', 'mixed-integer'])
+ @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
def cat(self, others=None, sep=None, na_rep=None, join=None):
"""
Concatenate strings in the Series/Index with given separator.
@@ -2284,7 +2323,7 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
if isinstance(others, str):
raise ValueError("Did you mean to supply a `sep` keyword?")
if sep is None:
- sep = ''
+ sep = ""
if isinstance(self._orig, Index):
data = Series(self._orig, index=self._orig)
@@ -2303,38 +2342,50 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
try:
# turn anything in "others" into lists of Series
- others, warn = self._get_series_list(others,
- ignore_index=(join is None))
+ others, warn = self._get_series_list(others, ignore_index=(join is None))
except ValueError: # do not catch TypeError raised by _get_series_list
if join is None:
- raise ValueError('All arrays must be same length, except '
- 'those having an index if `join` is not None')
+ raise ValueError(
+ "All arrays must be same length, except "
+ "those having an index if `join` is not None"
+ )
else:
- raise ValueError('If `others` contains arrays or lists (or '
- 'other list-likes without an index), these '
- 'must all be of the same length as the '
- 'calling Series/Index.')
+ raise ValueError(
+ "If `others` contains arrays or lists (or "
+ "other list-likes without an index), these "
+ "must all be of the same length as the "
+ "calling Series/Index."
+ )
if join is None and warn:
- warnings.warn("A future version of pandas will perform index "
- "alignment when `others` is a Series/Index/"
- "DataFrame (or a list-like containing one). To "
- "disable alignment (the behavior before v.0.23) and "
- "silence this warning, use `.values` on any Series/"
- "Index/DataFrame in `others`. To enable alignment "
- "and silence this warning, pass `join='left'|"
- "'outer'|'inner'|'right'`. The future default will "
- "be `join='left'`.", FutureWarning, stacklevel=3)
+ warnings.warn(
+ "A future version of pandas will perform index "
+ "alignment when `others` is a Series/Index/"
+ "DataFrame (or a list-like containing one). To "
+ "disable alignment (the behavior before v.0.23) and "
+ "silence this warning, use `.values` on any Series/"
+ "Index/DataFrame in `others`. To enable alignment "
+ "and silence this warning, pass `join='left'|"
+ "'outer'|'inner'|'right'`. The future default will "
+ "be `join='left'`.",
+ FutureWarning,
+ stacklevel=3,
+ )
# if join is None, _get_series_list already force-aligned indexes
- join = 'left' if join is None else join
+ join = "left" if join is None else join
# align if required
if any(not data.index.equals(x.index) for x in others):
# Need to add keys for uniqueness in case of duplicate columns
- others = concat(others, axis=1,
- join=(join if join == 'inner' else 'outer'),
- keys=range(len(others)), sort=False, copy=False)
+ others = concat(
+ others,
+ axis=1,
+ join=(join if join == "inner" else "outer"),
+ keys=range(len(others)),
+ sort=False,
+ copy=False,
+ )
data, others = data.align(others, join=join)
others = [others[x] for x in others] # again list of Series
@@ -2349,12 +2400,12 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
np.putmask(result, union_mask, np.nan)
not_masked = ~union_mask
- result[not_masked] = cat_safe([x[not_masked] for x in all_cols],
- sep)
+ result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
elif na_rep is not None and union_mask.any():
# fill NaNs with na_rep in case there are actually any NaNs
- all_cols = [np.where(nm, na_rep, col)
- for nm, col in zip(na_masks, all_cols)]
+ all_cols = [
+ np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
+ ]
result = cat_safe(all_cols, sep)
else:
# no NaNs - can just concatenate
@@ -2364,11 +2415,14 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
# add dtype for case that result is all-NA
result = Index(result, dtype=object, name=self._orig.name)
else: # Series
- result = Series(result, dtype=object, index=data.index,
- name=self._orig.name)
+ result = Series(
+ result, dtype=object, index=data.index, name=self._orig.name
+ )
return result
- _shared_docs['str_split'] = (r"""
+ _shared_docs[
+ "str_split"
+ ] = r"""
Split strings around given separator/delimiter.
Splits the string in the Series/Index from the %(side)s,
@@ -2496,25 +2550,23 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
>>> s.str.split(r"\+|=", expand=True)
0 1 2
0 1 1 2
- """)
+ """
- @Appender(_shared_docs['str_split'] % {
- 'side': 'beginning',
- 'method': 'split'})
- @forbid_nonstring_types(['bytes'])
+ @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"})
+ @forbid_nonstring_types(["bytes"])
def split(self, pat=None, n=-1, expand=False):
result = str_split(self._parent, pat, n=n)
return self._wrap_result(result, expand=expand)
- @Appender(_shared_docs['str_split'] % {
- 'side': 'end',
- 'method': 'rsplit'})
- @forbid_nonstring_types(['bytes'])
+ @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"})
+ @forbid_nonstring_types(["bytes"])
def rsplit(self, pat=None, n=-1, expand=False):
result = str_rsplit(self._parent, pat, n=n)
return self._wrap_result(result, expand=expand)
- _shared_docs['str_partition'] = ("""
+ _shared_docs[
+ "str_partition"
+ ] = """
Split the string at the %(side)s occurrence of `sep`.
This method splits the string at the %(side)s occurrence of `sep`,
@@ -2595,32 +2647,36 @@ def rsplit(self, pat=None, n=-1, expand=False):
>>> idx.str.partition(expand=False)
Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
- """)
-
- @Appender(_shared_docs['str_partition'] % {
- 'side': 'first',
- 'return': '3 elements containing the string itself, followed by two '
- 'empty strings',
- 'also': 'rpartition : Split the string at the last occurrence of '
- '`sep`.'
- })
- @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
- @forbid_nonstring_types(['bytes'])
- def partition(self, sep=' ', expand=True):
+ """
+
+ @Appender(
+ _shared_docs["str_partition"]
+ % {
+ "side": "first",
+ "return": "3 elements containing the string itself, followed by two "
+ "empty strings",
+ "also": "rpartition : Split the string at the last occurrence of `sep`.",
+ }
+ )
+ @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep")
+ @forbid_nonstring_types(["bytes"])
+ def partition(self, sep=" ", expand=True):
f = lambda x: x.partition(sep)
result = _na_map(f, self._parent)
return self._wrap_result(result, expand=expand)
- @Appender(_shared_docs['str_partition'] % {
- 'side': 'last',
- 'return': '3 elements containing two empty strings, followed by the '
- 'string itself',
- 'also': 'partition : Split the string at the first occurrence of '
- '`sep`.'
- })
- @deprecate_kwarg(old_arg_name='pat', new_arg_name='sep')
- @forbid_nonstring_types(['bytes'])
- def rpartition(self, sep=' ', expand=True):
+ @Appender(
+ _shared_docs["str_partition"]
+ % {
+ "side": "last",
+ "return": "3 elements containing two empty strings, followed by the "
+ "string itself",
+ "also": "partition : Split the string at the first occurrence of `sep`.",
+ }
+ )
+ @deprecate_kwarg(old_arg_name="pat", new_arg_name="sep")
+ @forbid_nonstring_types(["bytes"])
+ def rpartition(self, sep=" ", expand=True):
f = lambda x: x.rpartition(sep)
result = _na_map(f, self._parent)
return self._wrap_result(result, expand=expand)
@@ -2631,44 +2687,48 @@ def get(self, i):
return self._wrap_result(result)
@copy(str_join)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def join(self, sep):
result = str_join(self._parent, sep)
return self._wrap_result(result)
@copy(str_contains)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
- result = str_contains(self._parent, pat, case=case, flags=flags, na=na,
- regex=regex)
+ result = str_contains(
+ self._parent, pat, case=case, flags=flags, na=na, regex=regex
+ )
return self._wrap_result(result, fill_value=na)
@copy(str_match)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def match(self, pat, case=True, flags=0, na=np.nan):
result = str_match(self._parent, pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na)
@copy(str_replace)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
- result = str_replace(self._parent, pat, repl, n=n, case=case,
- flags=flags, regex=regex)
+ result = str_replace(
+ self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex
+ )
return self._wrap_result(result)
@copy(str_repeat)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def repeat(self, repeats):
result = str_repeat(self._parent, repeats)
return self._wrap_result(result)
@copy(str_pad)
- @forbid_nonstring_types(['bytes'])
- def pad(self, width, side='left', fillchar=' '):
+ @forbid_nonstring_types(["bytes"])
+ def pad(self, width, side="left", fillchar=" "):
result = str_pad(self._parent, width, side=side, fillchar=fillchar)
return self._wrap_result(result)
- _shared_docs['str_pad'] = ("""
+ _shared_docs[
+ "str_pad"
+ ] = """
Filling %(side)s side of strings in the Series/Index with an
additional character. Equivalent to :meth:`str.%(method)s`.
@@ -2683,25 +2743,24 @@ def pad(self, width, side='left', fillchar=' '):
Returns
-------
filled : Series/Index of objects
- """)
+ """
- @Appender(_shared_docs['str_pad'] % dict(side='left and right',
- method='center'))
- @forbid_nonstring_types(['bytes'])
- def center(self, width, fillchar=' '):
- return self.pad(width, side='both', fillchar=fillchar)
+ @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center"))
+ @forbid_nonstring_types(["bytes"])
+ def center(self, width, fillchar=" "):
+ return self.pad(width, side="both", fillchar=fillchar)
- @Appender(_shared_docs['str_pad'] % dict(side='right', method='ljust'))
- @forbid_nonstring_types(['bytes'])
- def ljust(self, width, fillchar=' '):
- return self.pad(width, side='right', fillchar=fillchar)
+ @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust"))
+ @forbid_nonstring_types(["bytes"])
+ def ljust(self, width, fillchar=" "):
+ return self.pad(width, side="right", fillchar=fillchar)
- @Appender(_shared_docs['str_pad'] % dict(side='left', method='rjust'))
- @forbid_nonstring_types(['bytes'])
- def rjust(self, width, fillchar=' '):
- return self.pad(width, side='left', fillchar=fillchar)
+ @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust"))
+ @forbid_nonstring_types(["bytes"])
+ def rjust(self, width, fillchar=" "):
+ return self.pad(width, side="left", fillchar=fillchar)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def zfill(self, width):
"""
Pad strings in the Series/Index by prepending '0' characters.
@@ -2762,7 +2821,7 @@ def zfill(self, width):
4 NaN
dtype: object
"""
- result = str_pad(self._parent, width, side='left', fillchar='0')
+ result = str_pad(self._parent, width, side="left", fillchar="0")
return self._wrap_result(result)
@copy(str_slice)
@@ -2771,7 +2830,7 @@ def slice(self, start=None, stop=None, step=None):
return self._wrap_result(result)
@copy(str_slice_replace)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def slice_replace(self, start=None, stop=None, repl=None):
result = str_slice_replace(self._parent, start, stop, repl)
return self._wrap_result(result)
@@ -2783,12 +2842,14 @@ def decode(self, encoding, errors="strict"):
return self._wrap_result(result)
@copy(str_encode)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def encode(self, encoding, errors="strict"):
result = str_encode(self._parent, encoding, errors)
return self._wrap_result(result)
- _shared_docs['str_strip'] = (r"""
+ _shared_docs[
+ "str_strip"
+ ] = r"""
Remove leading and trailing characters.
Strip whitespaces (including newlines) or a set of specified characters
@@ -2849,67 +2910,69 @@ def encode(self, encoding, errors="strict"):
2 Cat
3 NaN
dtype: object
- """)
+ """
- @Appender(_shared_docs['str_strip'] % dict(side='left and right sides',
- method='strip'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(
+ _shared_docs["str_strip"] % dict(side="left and right sides", method="strip")
+ )
+ @forbid_nonstring_types(["bytes"])
def strip(self, to_strip=None):
- result = str_strip(self._parent, to_strip, side='both')
+ result = str_strip(self._parent, to_strip, side="both")
return self._wrap_result(result)
- @Appender(_shared_docs['str_strip'] % dict(side='left side',
- method='lstrip'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(_shared_docs["str_strip"] % dict(side="left side", method="lstrip"))
+ @forbid_nonstring_types(["bytes"])
def lstrip(self, to_strip=None):
- result = str_strip(self._parent, to_strip, side='left')
+ result = str_strip(self._parent, to_strip, side="left")
return self._wrap_result(result)
- @Appender(_shared_docs['str_strip'] % dict(side='right side',
- method='rstrip'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(_shared_docs["str_strip"] % dict(side="right side", method="rstrip"))
+ @forbid_nonstring_types(["bytes"])
def rstrip(self, to_strip=None):
- result = str_strip(self._parent, to_strip, side='right')
+ result = str_strip(self._parent, to_strip, side="right")
return self._wrap_result(result)
@copy(str_wrap)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def wrap(self, width, **kwargs):
result = str_wrap(self._parent, width, **kwargs)
return self._wrap_result(result)
@copy(str_get_dummies)
- @forbid_nonstring_types(['bytes'])
- def get_dummies(self, sep='|'):
+ @forbid_nonstring_types(["bytes"])
+ def get_dummies(self, sep="|"):
# we need to cast to Series of strings as only that has all
# methods available for making the dummies...
data = self._orig.astype(str) if self._is_categorical else self._parent
result, name = str_get_dummies(data, sep)
- return self._wrap_result(result, use_codes=(not self._is_categorical),
- name=name, expand=True)
+ return self._wrap_result(
+ result, use_codes=(not self._is_categorical), name=name, expand=True
+ )
@copy(str_translate)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def translate(self, table):
result = str_translate(self._parent, table)
return self._wrap_result(result)
- count = _pat_wrapper(str_count, flags=True, name='count')
- startswith = _pat_wrapper(str_startswith, na=True, name='startswith')
- endswith = _pat_wrapper(str_endswith, na=True, name='endswith')
- findall = _pat_wrapper(str_findall, flags=True, name='findall')
+ count = _pat_wrapper(str_count, flags=True, name="count")
+ startswith = _pat_wrapper(str_startswith, na=True, name="startswith")
+ endswith = _pat_wrapper(str_endswith, na=True, name="endswith")
+ findall = _pat_wrapper(str_findall, flags=True, name="findall")
@copy(str_extract)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def extract(self, pat, flags=0, expand=True):
return str_extract(self, pat, flags=flags, expand=expand)
@copy(str_extractall)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def extractall(self, pat, flags=0):
return str_extractall(self._orig, pat, flags=flags)
- _shared_docs['find'] = ("""
+ _shared_docs[
+ "find"
+ ] = """
Return %(side)s indexes in each strings in the Series/Index
where the substring is fully contained between [start:end].
Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`.
@@ -2930,26 +2993,35 @@ def extractall(self, pat, flags=0):
See Also
--------
%(also)s
- """)
+ """
- @Appender(_shared_docs['find'] %
- dict(side='lowest', method='find',
- also='rfind : Return highest indexes in each strings.'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(
+ _shared_docs["find"]
+ % dict(
+ side="lowest",
+ method="find",
+ also="rfind : Return highest indexes in each strings.",
+ )
+ )
+ @forbid_nonstring_types(["bytes"])
def find(self, sub, start=0, end=None):
- result = str_find(self._parent, sub, start=start, end=end, side='left')
+ result = str_find(self._parent, sub, start=start, end=end, side="left")
return self._wrap_result(result)
- @Appender(_shared_docs['find'] %
- dict(side='highest', method='rfind',
- also='find : Return lowest indexes in each strings.'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(
+ _shared_docs["find"]
+ % dict(
+ side="highest",
+ method="rfind",
+ also="find : Return lowest indexes in each strings.",
+ )
+ )
+ @forbid_nonstring_types(["bytes"])
def rfind(self, sub, start=0, end=None):
- result = str_find(self._parent, sub,
- start=start, end=end, side='right')
+ result = str_find(self._parent, sub, start=start, end=end, side="right")
return self._wrap_result(result)
- @forbid_nonstring_types(['bytes'])
+ @forbid_nonstring_types(["bytes"])
def normalize(self, form):
"""
Return the Unicode normal form for the strings in the Series/Index.
@@ -2966,11 +3038,14 @@ def normalize(self, form):
normalized : Series/Index of objects
"""
import unicodedata
+
f = lambda x: unicodedata.normalize(form, x)
result = _na_map(f, self._parent)
return self._wrap_result(result)
- _shared_docs['index'] = ("""
+ _shared_docs[
+ "index"
+ ] = """
Return %(side)s indexes in each strings where the substring is
fully contained between [start:end]. This is the same as
``str.%(similar)s`` except instead of returning -1, it raises a ValueError
@@ -2992,27 +3067,39 @@ def normalize(self, form):
See Also
--------
%(also)s
- """)
+ """
- @Appender(_shared_docs['index'] %
- dict(side='lowest', similar='find', method='index',
- also='rindex : Return highest indexes in each strings.'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(
+ _shared_docs["index"]
+ % dict(
+ side="lowest",
+ similar="find",
+ method="index",
+ also="rindex : Return highest indexes in each strings.",
+ )
+ )
+ @forbid_nonstring_types(["bytes"])
def index(self, sub, start=0, end=None):
- result = str_index(self._parent, sub,
- start=start, end=end, side='left')
+ result = str_index(self._parent, sub, start=start, end=end, side="left")
return self._wrap_result(result)
- @Appender(_shared_docs['index'] %
- dict(side='highest', similar='rfind', method='rindex',
- also='index : Return lowest indexes in each strings.'))
- @forbid_nonstring_types(['bytes'])
+ @Appender(
+ _shared_docs["index"]
+ % dict(
+ side="highest",
+ similar="rfind",
+ method="rindex",
+ also="index : Return lowest indexes in each strings.",
+ )
+ )
+ @forbid_nonstring_types(["bytes"])
def rindex(self, sub, start=0, end=None):
- result = str_index(self._parent, sub,
- start=start, end=end, side='right')
+ result = str_index(self._parent, sub, start=start, end=end, side="right")
return self._wrap_result(result)
- _shared_docs['len'] = ("""
+ _shared_docs[
+ "len"
+ ] = """
Compute the length of each element in the Series/Index. The element may be
a sequence (such as a string, tuple or list) or a collection
(such as a dictionary).
@@ -3055,11 +3142,14 @@ def rindex(self, sub, start=0, end=None):
4 4.0
5 3.0
dtype: float64
- """)
- len = _noarg_wrapper(len, docstring=_shared_docs['len'],
- forbidden_types=None, dtype=int)
+ """
+ len = _noarg_wrapper(
+ len, docstring=_shared_docs["len"], forbidden_types=None, dtype=int
+ )
- _shared_docs['casemethods'] = ("""
+ _shared_docs[
+ "casemethods"
+ ] = """
Convert strings in the Series/Index to %(type)s.
%(version)s
Equivalent to :meth:`str.%(method)s`.
@@ -3124,45 +3214,56 @@ def rindex(self, sub, start=0, end=None):
2 THIS IS A SENTENCE
3 sWaPcAsE
dtype: object
- """)
+ """
# _doc_args holds dict of strings to use in substituting casemethod docs
_doc_args = {} # type: Dict[str, Dict[str, str]]
- _doc_args['lower'] = dict(type='lowercase', method='lower', version='')
- _doc_args['upper'] = dict(type='uppercase', method='upper', version='')
- _doc_args['title'] = dict(type='titlecase', method='title', version='')
- _doc_args['capitalize'] = dict(type='be capitalized', method='capitalize',
- version='')
- _doc_args['swapcase'] = dict(type='be swapcased', method='swapcase',
- version='')
- _doc_args['casefold'] = dict(type='be casefolded', method='casefold',
- version='\n .. versionadded:: 0.25.0\n')
- lower = _noarg_wrapper(lambda x: x.lower(),
- name='lower',
- docstring=_shared_docs['casemethods'] %
- _doc_args['lower'])
- upper = _noarg_wrapper(lambda x: x.upper(),
- name='upper',
- docstring=_shared_docs['casemethods'] %
- _doc_args['upper'])
- title = _noarg_wrapper(lambda x: x.title(),
- name='title',
- docstring=_shared_docs['casemethods'] %
- _doc_args['title'])
- capitalize = _noarg_wrapper(lambda x: x.capitalize(),
- name='capitalize',
- docstring=_shared_docs['casemethods'] %
- _doc_args['capitalize'])
- swapcase = _noarg_wrapper(lambda x: x.swapcase(),
- name='swapcase',
- docstring=_shared_docs['casemethods'] %
- _doc_args['swapcase'])
- casefold = _noarg_wrapper(lambda x: x.casefold(),
- name='casefold',
- docstring=_shared_docs['casemethods'] %
- _doc_args['casefold'])
-
- _shared_docs['ismethods'] = ("""
+ _doc_args["lower"] = dict(type="lowercase", method="lower", version="")
+ _doc_args["upper"] = dict(type="uppercase", method="upper", version="")
+ _doc_args["title"] = dict(type="titlecase", method="title", version="")
+ _doc_args["capitalize"] = dict(
+ type="be capitalized", method="capitalize", version=""
+ )
+ _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="")
+ _doc_args["casefold"] = dict(
+ type="be casefolded",
+ method="casefold",
+ version="\n .. versionadded:: 0.25.0\n",
+ )
+ lower = _noarg_wrapper(
+ lambda x: x.lower(),
+ name="lower",
+ docstring=_shared_docs["casemethods"] % _doc_args["lower"],
+ )
+ upper = _noarg_wrapper(
+ lambda x: x.upper(),
+ name="upper",
+ docstring=_shared_docs["casemethods"] % _doc_args["upper"],
+ )
+ title = _noarg_wrapper(
+ lambda x: x.title(),
+ name="title",
+ docstring=_shared_docs["casemethods"] % _doc_args["title"],
+ )
+ capitalize = _noarg_wrapper(
+ lambda x: x.capitalize(),
+ name="capitalize",
+ docstring=_shared_docs["casemethods"] % _doc_args["capitalize"],
+ )
+ swapcase = _noarg_wrapper(
+ lambda x: x.swapcase(),
+ name="swapcase",
+ docstring=_shared_docs["casemethods"] % _doc_args["swapcase"],
+ )
+ casefold = _noarg_wrapper(
+ lambda x: x.casefold(),
+ name="casefold",
+ docstring=_shared_docs["casemethods"] % _doc_args["casefold"],
+ )
+
+ _shared_docs[
+ "ismethods"
+ ] = """
Check whether all characters in each string are %(type)s.
This is equivalent to running the Python string method
@@ -3301,52 +3402,61 @@ def rindex(self, sub, start=0, end=None):
2 False
3 False
dtype: bool
- """)
- _doc_args['isalnum'] = dict(type='alphanumeric', method='isalnum')
- _doc_args['isalpha'] = dict(type='alphabetic', method='isalpha')
- _doc_args['isdigit'] = dict(type='digits', method='isdigit')
- _doc_args['isspace'] = dict(type='whitespace', method='isspace')
- _doc_args['islower'] = dict(type='lowercase', method='islower')
- _doc_args['isupper'] = dict(type='uppercase', method='isupper')
- _doc_args['istitle'] = dict(type='titlecase', method='istitle')
- _doc_args['isnumeric'] = dict(type='numeric', method='isnumeric')
- _doc_args['isdecimal'] = dict(type='decimal', method='isdecimal')
- isalnum = _noarg_wrapper(lambda x: x.isalnum(),
- name='isalnum',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isalnum'])
- isalpha = _noarg_wrapper(lambda x: x.isalpha(),
- name='isalpha',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isalpha'])
- isdigit = _noarg_wrapper(lambda x: x.isdigit(),
- name='isdigit',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isdigit'])
- isspace = _noarg_wrapper(lambda x: x.isspace(),
- name='isspace',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isspace'])
- islower = _noarg_wrapper(lambda x: x.islower(),
- name='islower',
- docstring=_shared_docs['ismethods'] %
- _doc_args['islower'])
- isupper = _noarg_wrapper(lambda x: x.isupper(),
- name='isupper',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isupper'])
- istitle = _noarg_wrapper(lambda x: x.istitle(),
- name='istitle',
- docstring=_shared_docs['ismethods'] %
- _doc_args['istitle'])
- isnumeric = _noarg_wrapper(lambda x: x.isnumeric(),
- name='isnumeric',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isnumeric'])
- isdecimal = _noarg_wrapper(lambda x: x.isdecimal(),
- name='isdecimal',
- docstring=_shared_docs['ismethods'] %
- _doc_args['isdecimal'])
+ """
+ _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum")
+ _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha")
+ _doc_args["isdigit"] = dict(type="digits", method="isdigit")
+ _doc_args["isspace"] = dict(type="whitespace", method="isspace")
+ _doc_args["islower"] = dict(type="lowercase", method="islower")
+ _doc_args["isupper"] = dict(type="uppercase", method="isupper")
+ _doc_args["istitle"] = dict(type="titlecase", method="istitle")
+ _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric")
+ _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal")
+ isalnum = _noarg_wrapper(
+ lambda x: x.isalnum(),
+ name="isalnum",
+ docstring=_shared_docs["ismethods"] % _doc_args["isalnum"],
+ )
+ isalpha = _noarg_wrapper(
+ lambda x: x.isalpha(),
+ name="isalpha",
+ docstring=_shared_docs["ismethods"] % _doc_args["isalpha"],
+ )
+ isdigit = _noarg_wrapper(
+ lambda x: x.isdigit(),
+ name="isdigit",
+ docstring=_shared_docs["ismethods"] % _doc_args["isdigit"],
+ )
+ isspace = _noarg_wrapper(
+ lambda x: x.isspace(),
+ name="isspace",
+ docstring=_shared_docs["ismethods"] % _doc_args["isspace"],
+ )
+ islower = _noarg_wrapper(
+ lambda x: x.islower(),
+ name="islower",
+ docstring=_shared_docs["ismethods"] % _doc_args["islower"],
+ )
+ isupper = _noarg_wrapper(
+ lambda x: x.isupper(),
+ name="isupper",
+ docstring=_shared_docs["ismethods"] % _doc_args["isupper"],
+ )
+ istitle = _noarg_wrapper(
+ lambda x: x.istitle(),
+ name="istitle",
+ docstring=_shared_docs["ismethods"] % _doc_args["istitle"],
+ )
+ isnumeric = _noarg_wrapper(
+ lambda x: x.isnumeric(),
+ name="isnumeric",
+ docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"],
+ )
+ isdecimal = _noarg_wrapper(
+ lambda x: x.isdecimal(),
+ name="isdecimal",
+ docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"],
+ )
@classmethod
def _make_accessor(cls, data):
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
index 3e3318ed4c4b6..e9d2c3f07bfae 100644
--- a/pandas/core/tools/datetimes.py
+++ b/pandas/core/tools/datetimes.py
@@ -8,16 +8,33 @@
from pandas._libs import tslib, tslibs
from pandas._libs.tslibs import Timestamp, conversion, parsing
from pandas._libs.tslibs.parsing import ( # noqa
- DateParseError, _format_is_iso, _guess_datetime_format, parse_time_string)
+ DateParseError,
+ _format_is_iso,
+ _guess_datetime_format,
+ parse_time_string,
+)
from pandas._libs.tslibs.strptime import array_strptime
from pandas.util._decorators import deprecate_kwarg
from pandas.core.dtypes.common import (
- ensure_object, is_datetime64_dtype, is_datetime64_ns_dtype,
- is_datetime64tz_dtype, is_float, is_integer, is_integer_dtype,
- is_list_like, is_numeric_dtype, is_scalar)
+ ensure_object,
+ is_datetime64_dtype,
+ is_datetime64_ns_dtype,
+ is_datetime64tz_dtype,
+ is_float,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCDatetimeIndex, ABCIndex, ABCIndexClass, ABCSeries)
+ ABCDataFrame,
+ ABCDatetimeIndex,
+ ABCIndex,
+ ABCIndexClass,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import notna
from pandas._typing import ArrayLike
@@ -35,9 +52,10 @@
# types used in annotations
Scalar = Union[int, float, str]
-DatetimeScalar = TypeVar('DatetimeScalar', Scalar, datetime)
-DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, list, tuple,
- ArrayLike, ABCSeries]
+DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime)
+DatetimeScalarOrArrayConvertible = Union[
+ DatetimeScalar, list, tuple, ArrayLike, ABCSeries
+]
# ---------------------------------------------------------------------
@@ -50,8 +68,9 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
-def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
- check_count: Optional[int] = None) -> bool:
+def should_cache(
+ arg: ArrayConvertible, unique_share: float = 0.7, check_count: Optional[int] = None
+) -> bool:
"""
Decides whether to do caching.
@@ -91,12 +110,13 @@ def should_cache(arg: ArrayConvertible, unique_share: float = 0.7,
else:
check_count = 500
else:
- assert 0 <= check_count <= len(arg), \
- 'check_count must be in next bounds: [0; len(arg)]'
+ assert (
+ 0 <= check_count <= len(arg)
+ ), "check_count must be in next bounds: [0; len(arg)]"
if check_count == 0:
return False
- assert 0 < unique_share < 1, 'unique_share must be in next bounds: (0; 1)'
+ assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"
unique_elements = unique(arg[:check_count])
if len(unique_elements) > check_count * unique_share:
@@ -124,6 +144,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
Cache of converted, unique dates. Can be empty
"""
from pandas import Series
+
cache_array = Series()
if cache:
# Perform a quicker unique check
@@ -138,9 +159,7 @@ def _maybe_cache(arg, format, cache, convert_listlike):
def _box_as_indexlike(
- dt_array: ArrayLike,
- utc: Optional[bool] = None,
- name: Optional[str] = None
+ dt_array: ArrayLike, utc: Optional[bool] = None, name: Optional[str] = None
) -> Union[ABCIndex, ABCDatetimeIndex]:
"""
Properly boxes the ndarray of datetimes to DatetimeIndex
@@ -162,8 +181,9 @@ def _box_as_indexlike(
- general Index otherwise
"""
from pandas import DatetimeIndex, Index
+
if is_datetime64_dtype(dt_array):
- tz = 'utc' if utc else None
+ tz = "utc" if utc else None
return DatetimeIndex(dt_array, tz=tz, name=name)
return Index(dt_array, name=name)
@@ -172,7 +192,7 @@ def _convert_and_box_cache(
arg: DatetimeScalarOrArrayConvertible,
cache_array: ABCSeries,
box: bool,
- name: Optional[str] = None
+ name: Optional[str] = None,
) -> Union[ABCIndex, np.ndarray]:
"""
Convert array of dates with a cache and box the result
@@ -194,6 +214,7 @@ def _convert_and_box_cache(
- ndarray if box=False
"""
from pandas import Series
+
result = Series(arg).map(cache_array)
if box:
return _box_as_indexlike(result, utc=None, name=name)
@@ -226,21 +247,34 @@ def _return_parsed_timezone_results(result, timezones, box, tz, name):
- ndarray of Timestamps if box=False
"""
if tz is not None:
- raise ValueError("Cannot pass a tz argument when "
- "parsing strings with timezone "
- "information.")
- tz_results = np.array([Timestamp(res).tz_localize(zone) for res, zone
- in zip(result, timezones)])
+ raise ValueError(
+ "Cannot pass a tz argument when "
+ "parsing strings with timezone "
+ "information."
+ )
+ tz_results = np.array(
+ [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)]
+ )
if box:
from pandas import Index
+
return Index(tz_results, name=name)
return tz_results
-def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
- unit=None, errors=None,
- infer_datetime_format=None, dayfirst=None,
- yearfirst=None, exact=None):
+def _convert_listlike_datetimes(
+ arg,
+ box,
+ format,
+ name=None,
+ tz=None,
+ unit=None,
+ errors=None,
+ infer_datetime_format=None,
+ dayfirst=None,
+ yearfirst=None,
+ exact=None,
+):
"""
Helper function for to_datetime. Performs the conversions of 1D listlike
of dates
@@ -279,16 +313,18 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
from pandas import DatetimeIndex
from pandas.core.arrays import DatetimeArray
from pandas.core.arrays.datetimes import (
- maybe_convert_dtype, objects_to_datetime64ns)
+ maybe_convert_dtype,
+ objects_to_datetime64ns,
+ )
if isinstance(arg, (list, tuple)):
- arg = np.array(arg, dtype='O')
+ arg = np.array(arg, dtype="O")
# these are shortcutable
if is_datetime64tz_dtype(arg):
if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
return DatetimeIndex(arg, tz=tz, name=name)
- if tz == 'utc':
+ if tz == "utc":
arg = arg.tz_convert(None).tz_localize(tz)
return arg
@@ -304,12 +340,12 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
elif unit is not None:
if format is not None:
raise ValueError("cannot specify both format and unit")
- arg = getattr(arg, 'values', arg)
- result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit,
- errors=errors)
+ arg = getattr(arg, "values", arg)
+ result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
if box:
- if errors == 'ignore':
+ if errors == "ignore":
from pandas import Index
+
result = Index(result, name=name)
else:
result = DatetimeIndex(result, name=name)
@@ -317,7 +353,7 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
# GH 25546: Apply tz_parsed first (from arg), then tz (from caller)
# result will be naive but in UTC
try:
- result = result.tz_localize('UTC').tz_convert(tz_parsed)
+ result = result.tz_localize("UTC").tz_convert(tz_parsed)
except AttributeError:
# Regular Index from 'ignore' path
return result
@@ -327,9 +363,10 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
else:
result = result.tz_convert(tz)
return result
- elif getattr(arg, 'ndim', 1) > 1:
- raise TypeError('arg must be a string, datetime, list, tuple, '
- '1-d array, or Series')
+ elif getattr(arg, "ndim", 1) > 1:
+ raise TypeError(
+ "arg must be a string, datetime, list, tuple, 1-d array, or Series"
+ )
# warn if passing timedelta64, raise for PeriodDtype
# NB: this must come after unit transformation
@@ -358,30 +395,33 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
if format is not None:
try:
# shortcut formatting here
- if format == '%Y%m%d':
+ if format == "%Y%m%d":
try:
# pass orig_arg as float-dtype may have been converted to
# datetime64[ns]
orig_arg = ensure_object(orig_arg)
result = _attempt_YYYYMMDD(orig_arg, errors=errors)
except (ValueError, TypeError, tslibs.OutOfBoundsDatetime):
- raise ValueError("cannot convert the input to "
- "'%Y%m%d' date format")
+ raise ValueError(
+ "cannot convert the input to '%Y%m%d' date format"
+ )
# fallback
if result is None:
try:
result, timezones = array_strptime(
- arg, format, exact=exact, errors=errors)
- if '%Z' in format or '%z' in format:
+ arg, format, exact=exact, errors=errors
+ )
+ if "%Z" in format or "%z" in format:
return _return_parsed_timezone_results(
- result, timezones, box, tz, name)
+ result, timezones, box, tz, name
+ )
except tslibs.OutOfBoundsDatetime:
- if errors == 'raise':
+ if errors == "raise":
raise
- elif errors == 'coerce':
- result = np.empty(arg.shape, dtype='M8[ns]')
- iresult = result.view('i8')
+ elif errors == "coerce":
+ result = np.empty(arg.shape, dtype="M8[ns]")
+ iresult = result.view("i8")
iresult.fill(tslibs.iNaT)
else:
result = arg
@@ -390,11 +430,11 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
# to array_to_datetime - terminate here
# for specified formats
if not infer_datetime_format:
- if errors == 'raise':
+ if errors == "raise":
raise
- elif errors == 'coerce':
- result = np.empty(arg.shape, dtype='M8[ns]')
- iresult = result.view('i8')
+ elif errors == "coerce":
+ result = np.empty(arg.shape, dtype="M8[ns]")
+ iresult = result.view("i8")
iresult.fill(tslibs.iNaT)
else:
result = arg
@@ -409,27 +449,30 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
if result is None:
assert format is None or infer_datetime_format
- utc = tz == 'utc'
+ utc = tz == "utc"
result, tz_parsed = objects_to_datetime64ns(
- arg, dayfirst=dayfirst, yearfirst=yearfirst,
- utc=utc, errors=errors, require_iso8601=require_iso8601,
- allow_object=True)
+ arg,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ utc=utc,
+ errors=errors,
+ require_iso8601=require_iso8601,
+ allow_object=True,
+ )
if tz_parsed is not None:
if box:
# We can take a shortcut since the datetime64 numpy array
# is in UTC
- return DatetimeIndex._simple_new(result, name=name,
- tz=tz_parsed)
+ return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed)
else:
# Convert the datetime64 numpy array to an numpy array
# of datetime objects
- result = [Timestamp(ts, tz=tz_parsed).to_pydatetime()
- for ts in result]
+ result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() for ts in result]
return np.array(result, dtype=object)
if box:
- utc = tz == 'utc'
+ utc = tz == "utc"
return _box_as_indexlike(result, utc=utc, name=name)
return result
@@ -452,16 +495,15 @@ def _adjust_to_origin(arg, origin, unit):
-------
ndarray or scalar of adjusted date(s)
"""
- if origin == 'julian':
+ if origin == "julian":
original = arg
j0 = Timestamp(0).to_julian_date()
- if unit != 'D':
+ if unit != "D":
raise ValueError("unit must be 'D' for origin='julian'")
try:
arg = arg - j0
except TypeError:
- raise ValueError("incompatible 'arg' type for given "
- "'origin'='julian'")
+ raise ValueError("incompatible 'arg' type for given 'origin'='julian'")
# preemptively check this for a nice range
j_max = Timestamp.max.to_julian_date() - j0
@@ -469,30 +511,36 @@ def _adjust_to_origin(arg, origin, unit):
if np.any(arg > j_max) or np.any(arg < j_min):
raise tslibs.OutOfBoundsDatetime(
"{original} is Out of Bounds for "
- "origin='julian'".format(original=original))
+ "origin='julian'".format(original=original)
+ )
else:
# arg must be numeric
- if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or
- is_numeric_dtype(np.asarray(arg))):
+ if not (
+ (is_scalar(arg) and (is_integer(arg) or is_float(arg)))
+ or is_numeric_dtype(np.asarray(arg))
+ ):
raise ValueError(
"'{arg}' is not compatible with origin='{origin}'; "
"it must be numeric with a unit specified ".format(
- arg=arg,
- origin=origin))
+ arg=arg, origin=origin
+ )
+ )
# we are going to offset back to unix / epoch time
try:
offset = Timestamp(origin)
except tslibs.OutOfBoundsDatetime:
raise tslibs.OutOfBoundsDatetime(
- "origin {origin} is Out of Bounds".format(origin=origin))
+ "origin {origin} is Out of Bounds".format(origin=origin)
+ )
except ValueError:
- raise ValueError("origin {origin} cannot be converted "
- "to a Timestamp".format(origin=origin))
+ raise ValueError(
+ "origin {origin} cannot be converted "
+ "to a Timestamp".format(origin=origin)
+ )
if offset.tz is not None:
- raise ValueError(
- "origin offset {} must be tz-naive".format(offset))
+ raise ValueError("origin offset {} must be tz-naive".format(offset))
offset -= Timestamp(0)
# convert the offset to the unit of the arg
@@ -501,17 +549,28 @@ def _adjust_to_origin(arg, origin, unit):
# scalars & ndarray-like can handle the addition
if is_list_like(arg) and not isinstance(
- arg, (ABCSeries, ABCIndexClass, np.ndarray)):
+ arg, (ABCSeries, ABCIndexClass, np.ndarray)
+ ):
arg = np.asarray(arg)
arg = arg + offset
return arg
-@deprecate_kwarg(old_arg_name='box', new_arg_name=None)
-def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
- utc=None, box=True, format=None, exact=True,
- unit=None, infer_datetime_format=False, origin='unix',
- cache=True):
+@deprecate_kwarg(old_arg_name="box", new_arg_name=None)
+def to_datetime(
+ arg,
+ errors="raise",
+ dayfirst=False,
+ yearfirst=False,
+ utc=None,
+ box=True,
+ format=None,
+ exact=True,
+ unit=None,
+ infer_datetime_format=False,
+ origin="unix",
+ cache=True,
+):
"""
Convert argument to datetime.
@@ -686,14 +745,20 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
if arg is None:
return None
- if origin != 'unix':
+ if origin != "unix":
arg = _adjust_to_origin(arg, origin, unit)
- tz = 'utc' if utc else None
- convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit,
- dayfirst=dayfirst, yearfirst=yearfirst,
- errors=errors, exact=exact,
- infer_datetime_format=infer_datetime_format)
+ tz = "utc" if utc else None
+ convert_listlike = partial(
+ _convert_listlike_datetimes,
+ tz=tz,
+ unit=unit,
+ dayfirst=dayfirst,
+ yearfirst=yearfirst,
+ errors=errors,
+ exact=exact,
+ infer_datetime_format=infer_datetime_format,
+ )
if isinstance(arg, Timestamp):
result = arg
@@ -714,8 +779,7 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
elif isinstance(arg, ABCIndexClass):
cache_array = _maybe_cache(arg, format, cache, convert_listlike)
if not cache_array.empty:
- result = _convert_and_box_cache(arg, cache_array, box,
- name=arg.name)
+ result = _convert_and_box_cache(arg, cache_array, box, name=arg.name)
else:
convert_listlike = partial(convert_listlike, name=arg.name)
result = convert_listlike(arg, box, format)
@@ -732,28 +796,29 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
# mappings for assembling units
-_unit_map = {'year': 'year',
- 'years': 'year',
- 'month': 'month',
- 'months': 'month',
- 'day': 'day',
- 'days': 'day',
- 'hour': 'h',
- 'hours': 'h',
- 'minute': 'm',
- 'minutes': 'm',
- 'second': 's',
- 'seconds': 's',
- 'ms': 'ms',
- 'millisecond': 'ms',
- 'milliseconds': 'ms',
- 'us': 'us',
- 'microsecond': 'us',
- 'microseconds': 'us',
- 'ns': 'ns',
- 'nanosecond': 'ns',
- 'nanoseconds': 'ns'
- }
+_unit_map = {
+ "year": "year",
+ "years": "year",
+ "month": "month",
+ "months": "month",
+ "day": "day",
+ "days": "day",
+ "hour": "h",
+ "hours": "h",
+ "minute": "m",
+ "minutes": "m",
+ "second": "s",
+ "seconds": "s",
+ "ms": "ms",
+ "millisecond": "ms",
+ "milliseconds": "ms",
+ "us": "us",
+ "microsecond": "us",
+ "microseconds": "us",
+ "ns": "ns",
+ "nanosecond": "ns",
+ "nanoseconds": "ns",
+}
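As a reviewer aid, the unit map above backs the DataFrame-assembly path of to_datetime; a short sketch of that documented behaviour:

    import pandas as pd

    df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
    pd.to_datetime(df)
    # 0   2015-02-04
    # 1   2016-03-05
    # dtype: datetime64[ns]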
def _assemble_from_unit_mappings(arg, errors, box, tz):
@@ -780,6 +845,7 @@ def _assemble_from_unit_mappings(arg, errors, box, tz):
Series
"""
from pandas import to_timedelta, to_numeric, DataFrame
+
arg = DataFrame(arg)
if not arg.columns.is_unique:
raise ValueError("cannot assemble with duplicate keys")
@@ -799,19 +865,23 @@ def f(value):
unit_rev = {v: k for k, v in unit.items()}
# we require at least Ymd
- required = ['year', 'month', 'day']
+ required = ["year", "month", "day"]
req = sorted(list(set(required) - set(unit_rev.keys())))
if len(req):
- raise ValueError("to assemble mappings requires at least that "
- "[year, month, day] be specified: [{required}] "
- "is missing".format(required=','.join(req)))
+ raise ValueError(
+ "to assemble mappings requires at least that "
+ "[year, month, day] be specified: [{required}] "
+ "is missing".format(required=",".join(req))
+ )
# keys we don't recognize
excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values())))
if len(excess):
- raise ValueError("extra keys have been passed "
- "to the datetime assemblage: "
- "[{excess}]".format(excess=','.join(excess)))
+ raise ValueError(
+ "extra keys have been passed "
+ "to the datetime assemblage: "
+ "[{excess}]".format(excess=",".join(excess))
+ )
def coerce(values):
# we allow coercion to if errors allows
@@ -819,28 +889,29 @@ def coerce(values):
# prevent overflow in case of int8 or int16
if is_integer_dtype(values):
- values = values.astype('int64', copy=False)
+ values = values.astype("int64", copy=False)
return values
- values = (coerce(arg[unit_rev['year']]) * 10000 +
- coerce(arg[unit_rev['month']]) * 100 +
- coerce(arg[unit_rev['day']]))
+ values = (
+ coerce(arg[unit_rev["year"]]) * 10000
+ + coerce(arg[unit_rev["month"]]) * 100
+ + coerce(arg[unit_rev["day"]])
+ )
try:
- values = to_datetime(values, format='%Y%m%d', errors=errors, utc=tz)
+ values = to_datetime(values, format="%Y%m%d", errors=errors, utc=tz)
except (TypeError, ValueError) as e:
- raise ValueError("cannot assemble the "
- "datetimes: {error}".format(error=e))
+ raise ValueError("cannot assemble the " "datetimes: {error}".format(error=e))
- for u in ['h', 'm', 's', 'ms', 'us', 'ns']:
+ for u in ["h", "m", "s", "ms", "us", "ns"]:
value = unit_rev.get(u)
if value is not None and value in arg:
try:
- values += to_timedelta(coerce(arg[value]),
- unit=u,
- errors=errors)
+ values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
except (TypeError, ValueError) as e:
- raise ValueError("cannot assemble the datetimes [{value}]: "
- "{error}".format(value=value, error=e))
+ raise ValueError(
+ "cannot assemble the datetimes [{value}]: "
+ "{error}".format(value=value, error=e)
+ )
if not box:
return values.values
return values
@@ -861,18 +932,18 @@ def _attempt_YYYYMMDD(arg, errors):
def calc(carg):
# calculate the actual result
carg = carg.astype(object)
- parsed = parsing.try_parse_year_month_day(carg / 10000,
- carg / 100 % 100,
- carg % 100)
+ parsed = parsing.try_parse_year_month_day(
+ carg / 10000, carg / 100 % 100, carg % 100
+ )
return tslib.array_to_datetime(parsed, errors=errors)[0]
def calc_with_mask(carg, mask):
- result = np.empty(carg.shape, dtype='M8[ns]')
- iresult = result.view('i8')
+ result = np.empty(carg.shape, dtype="M8[ns]")
+ iresult = result.view("i8")
iresult[~mask] = tslibs.iNaT
masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
- result[mask] = masked_result.astype('M8[ns]')
+ result[mask] = masked_result.astype("M8[ns]")
return result
# try intlike / strings that are ints
@@ -899,8 +970,16 @@ def calc_with_mask(carg, mask):
# Fixed time formats for time parsing
-_time_formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
- "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"]
+_time_formats = [
+ "%H:%M",
+ "%H%M",
+ "%I:%M%p",
+ "%I%M%p",
+ "%H:%M:%S",
+ "%H%M%S",
+ "%I:%M:%S%p",
+ "%I%M%S%p",
+]
def _guess_time_format_for_array(arr):
@@ -918,7 +997,7 @@ def _guess_time_format_for_array(arr):
return None
-def to_time(arg, format=None, infer_time_format=False, errors='raise'):
+def to_time(arg, format=None, infer_time_format=False, errors="raise"):
"""
Parse time strings to time objects using fixed strptime formats ("%H:%M",
"%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p",
@@ -949,11 +1028,12 @@ def to_time(arg, format=None, infer_time_format=False, errors='raise'):
def _convert_listlike(arg, format):
if isinstance(arg, (list, tuple)):
- arg = np.array(arg, dtype='O')
+ arg = np.array(arg, dtype="O")
- elif getattr(arg, 'ndim', 1) > 1:
- raise TypeError('arg must be a string, datetime, list, tuple, '
- '1-d array, or Series')
+ elif getattr(arg, "ndim", 1) > 1:
+ raise TypeError(
+ "arg must be a string, datetime, list, tuple, " "1-d array, or Series"
+ )
arg = ensure_object(arg)
@@ -966,12 +1046,13 @@ def _convert_listlike(arg, format):
try:
times.append(datetime.strptime(element, format).time())
except (ValueError, TypeError):
- if errors == 'raise':
- msg = ("Cannot convert {element} to a time with given "
- "format {format}").format(element=element,
- format=format)
+ if errors == "raise":
+ msg = (
+ "Cannot convert {element} to a time with given "
+ "format {format}"
+ ).format(element=element, format=format)
raise ValueError(msg)
- elif errors == 'ignore':
+ elif errors == "ignore":
return arg
else:
times.append(None)
@@ -982,8 +1063,7 @@ def _convert_listlike(arg, format):
time_object = None
for time_format in formats:
try:
- time_object = datetime.strptime(element,
- time_format).time()
+ time_object = datetime.strptime(element, time_format).time()
if not format_found:
# Put the found format in front
fmt = formats.pop(formats.index(time_format))
@@ -995,10 +1075,11 @@ def _convert_listlike(arg, format):
if time_object is not None:
times.append(time_object)
- elif errors == 'raise':
- raise ValueError("Cannot convert arg {arg} to "
- "a time".format(arg=arg))
- elif errors == 'ignore':
+ elif errors == "raise":
+ raise ValueError(
+ "Cannot convert arg {arg} to " "a time".format(arg=arg)
+ )
+ elif errors == "ignore":
return arg
else:
times.append(None)
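The fixed-format matching reformatted above amounts to trying each strptime pattern in turn; a stdlib-only sketch of that idea (not the pandas helper itself):

    from datetime import datetime

    _formats = ["%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
                "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"]

    def parse_time(text):
        # return the first format that parses, else None
        for fmt in _formats:
            try:
                return datetime.strptime(text, fmt).time()
            except ValueError:
                pass
        return None

    parse_time("14:30")       # datetime.time(14, 30)
    parse_time("02:30:05PM")  # datetime.time(14, 30, 5)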
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
index d7a1b1119ce4b..e1a976b874c25 100644
--- a/pandas/core/tools/numeric.py
+++ b/pandas/core/tools/numeric.py
@@ -4,14 +4,19 @@
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
- ensure_object, is_datetime_or_timedelta_dtype, is_decimal, is_number,
- is_numeric_dtype, is_scalar)
+ ensure_object,
+ is_datetime_or_timedelta_dtype,
+ is_decimal,
+ is_number,
+ is_numeric_dtype,
+ is_scalar,
+)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
import pandas as pd
-def to_numeric(arg, errors='raise', downcast=None):
+def to_numeric(arg, errors="raise", downcast=None):
"""
Convert argument to a numeric type.
@@ -102,11 +107,11 @@ def to_numeric(arg, errors='raise', downcast=None):
3 -3.0
dtype: float64
"""
- if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
- raise ValueError('invalid downcasting method provided')
+ if downcast not in (None, "integer", "signed", "unsigned", "float"):
+ raise ValueError("invalid downcasting method provided")
- if errors not in ('ignore', 'raise', 'coerce'):
- raise ValueError('invalid error value specified')
+ if errors not in ("ignore", "raise", "coerce"):
+ raise ValueError("invalid error value specified")
is_series = False
is_index = False
@@ -121,16 +126,16 @@ def to_numeric(arg, errors='raise', downcast=None):
if values is None:
values = arg.values
elif isinstance(arg, (list, tuple)):
- values = np.array(arg, dtype='O')
+ values = np.array(arg, dtype="O")
elif is_scalar(arg):
if is_decimal(arg):
return float(arg)
if is_number(arg):
return arg
is_scalars = True
- values = np.array([arg], dtype='O')
- elif getattr(arg, 'ndim', 1) > 1:
- raise TypeError('arg must be a list, tuple, 1-d array, or Series')
+ values = np.array([arg], dtype="O")
+ elif getattr(arg, "ndim", 1) > 1:
+ raise TypeError("arg must be a list, tuple, 1-d array, or Series")
else:
values = arg
@@ -141,12 +146,13 @@ def to_numeric(arg, errors='raise', downcast=None):
values = values.astype(np.int64)
else:
values = ensure_object(values)
- coerce_numeric = errors not in ('ignore', 'raise')
- values = lib.maybe_convert_numeric(values, set(),
- coerce_numeric=coerce_numeric)
+ coerce_numeric = errors not in ("ignore", "raise")
+ values = lib.maybe_convert_numeric(
+ values, set(), coerce_numeric=coerce_numeric
+ )
except Exception:
- if errors == 'raise':
+ if errors == "raise":
raise
# attempt downcast only if the data has been successfully converted
@@ -154,12 +160,12 @@ def to_numeric(arg, errors='raise', downcast=None):
if downcast is not None and is_numeric_dtype(values):
typecodes = None
- if downcast in ('integer', 'signed'):
- typecodes = np.typecodes['Integer']
- elif downcast == 'unsigned' and np.min(values) >= 0:
- typecodes = np.typecodes['UnsignedInteger']
- elif downcast == 'float':
- typecodes = np.typecodes['Float']
+ if downcast in ("integer", "signed"):
+ typecodes = np.typecodes["Integer"]
+ elif downcast == "unsigned" and np.min(values) >= 0:
+ typecodes = np.typecodes["UnsignedInteger"]
+ elif downcast == "float":
+ typecodes = np.typecodes["Float"]
# pandas support goes only to np.float32,
# as float dtypes smaller than that are
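Before moving on to timedeltas.py, a short usage sketch of the to_numeric options whose validation is reformatted above (outputs follow the documented behaviour):

    import pandas as pd

    s = pd.Series(["1.0", "2", -3])
    pd.to_numeric(s, downcast="float")                          # float32: 1.0, 2.0, -3.0
    pd.to_numeric(pd.Series(["apple", "1"]), errors="coerce")   # NaN, 1.0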
diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py
index 5e89b73c8754e..2c594a3df27ea 100644
--- a/pandas/core/tools/timedeltas.py
+++ b/pandas/core/tools/timedeltas.py
@@ -16,8 +16,8 @@
from pandas.core.arrays.timedeltas import sequence_to_td64ns
-@deprecate_kwarg(old_arg_name='box', new_arg_name=None)
-def to_timedelta(arg, unit='ns', box=True, errors='raise'):
+@deprecate_kwarg(old_arg_name="box", new_arg_name=None)
+def to_timedelta(arg, unit="ns", box=True, errors="raise"):
"""
Convert argument to timedelta.
@@ -96,50 +96,49 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'):
"""
unit = parse_timedelta_unit(unit)
- if errors not in ('ignore', 'raise', 'coerce'):
- raise ValueError("errors must be one of 'ignore', "
- "'raise', or 'coerce'}")
+ if errors not in ("ignore", "raise", "coerce"):
+ raise ValueError("errors must be one of 'ignore', " "'raise', or 'coerce'}")
- if unit in {'Y', 'y', 'M'}:
- warnings.warn("M and Y units are deprecated and "
- "will be removed in a future version.",
- FutureWarning, stacklevel=2)
+ if unit in {"Y", "y", "M"}:
+ warnings.warn(
+ "M and Y units are deprecated and " "will be removed in a future version.",
+ FutureWarning,
+ stacklevel=2,
+ )
if arg is None:
return arg
elif isinstance(arg, ABCSeries):
- values = _convert_listlike(arg._values, unit=unit,
- box=False, errors=errors)
+ values = _convert_listlike(arg._values, unit=unit, box=False, errors=errors)
return arg._constructor(values, index=arg.index, name=arg.name)
elif isinstance(arg, ABCIndexClass):
- return _convert_listlike(arg, unit=unit, box=box,
- errors=errors, name=arg.name)
+ return _convert_listlike(arg, unit=unit, box=box, errors=errors, name=arg.name)
elif isinstance(arg, np.ndarray) and arg.ndim == 0:
# extract array scalar and process below
arg = arg.item()
- elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1:
+ elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1:
return _convert_listlike(arg, unit=unit, box=box, errors=errors)
- elif getattr(arg, 'ndim', 1) > 1:
- raise TypeError('arg must be a string, timedelta, list, tuple, '
- '1-d array, or Series')
+ elif getattr(arg, "ndim", 1) > 1:
+ raise TypeError(
+ "arg must be a string, timedelta, list, tuple, " "1-d array, or Series"
+ )
# ...so it must be a scalar value. Return scalar.
- return _coerce_scalar_to_timedelta_type(arg, unit=unit,
- box=box, errors=errors)
+ return _coerce_scalar_to_timedelta_type(arg, unit=unit, box=box, errors=errors)
-def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'):
+def _coerce_scalar_to_timedelta_type(r, unit="ns", box=True, errors="raise"):
"""Convert string 'r' to a timedelta object."""
try:
result = Timedelta(r, unit)
if not box:
# explicitly view as timedelta64 for case when result is pd.NaT
- result = result.asm8.view('timedelta64[ns]')
+ result = result.asm8.view("timedelta64[ns]")
except ValueError:
- if errors == 'raise':
+ if errors == "raise":
raise
- elif errors == 'ignore':
+ elif errors == "ignore":
return r
# coerce
@@ -148,10 +147,10 @@ def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'):
return result
-def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):
+def _convert_listlike(arg, unit="ns", box=True, errors="raise", name=None):
"""Convert a list of objects to a timedelta index object."""
- if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):
+ if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"):
# This is needed only to ensure that in the case where we end up
# returning arg (errors == "ignore"), and where the input is a
# generator, we return a useful list-like instead of a
@@ -159,10 +158,9 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):
arg = np.array(list(arg), dtype=object)
try:
- value = sequence_to_td64ns(arg, unit=unit,
- errors=errors, copy=False)[0]
+ value = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0]
except ValueError:
- if errors == 'ignore':
+ if errors == "ignore":
return arg
else:
# This else-block accounts for the cases when errors='raise'
@@ -176,5 +174,6 @@ def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):
if box:
from pandas import TimedeltaIndex
- value = TimedeltaIndex(value, unit='ns', name=name)
+
+ value = TimedeltaIndex(value, unit="ns", name=name)
return value
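And a quick sketch of the to_timedelta paths touched in this file, unit handling and coercion of bad values (outputs abbreviated):

    import numpy as np
    import pandas as pd

    pd.to_timedelta(np.arange(3), unit="s")
    # TimedeltaIndex(['00:00:00', '00:00:01', '00:00:02'], dtype='timedelta64[ns]', freq=None)

    pd.to_timedelta(["1 days 06:05:01", "not a delta"], errors="coerce")
    # invalid entries become NaT instead of raising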
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index a916f2f06df21..f07133baed435 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -10,13 +10,20 @@
from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_extension_array_dtype, is_list_like)
+ is_categorical_dtype,
+ is_extension_array_dtype,
+ is_list_like,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCSeries)
+ ABCDataFrame,
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import isna
# 16 byte long hashing key
-_default_hash_key = '0123456789123456'
+_default_hash_key = "0123456789123456"
def _combine_hash_arrays(arrays, num_items):
@@ -42,13 +49,14 @@ def _combine_hash_arrays(arrays, num_items):
out ^= a
out *= mult
mult += np.uint64(82520 + inverse_i + inverse_i)
- assert i + 1 == num_items, 'Fed in wrong num_items'
+ assert i + 1 == num_items, "Fed in wrong num_items"
out += np.uint64(97531)
return out
-def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
- categorize=True):
+def hash_pandas_object(
+ obj, index=True, encoding="utf8", hash_key=None, categorize=True
+):
"""
Return a data hash of the Index/Series/DataFrame
@@ -72,53 +80,63 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
Series of uint64, same length as the object
"""
from pandas import Series
+
if hash_key is None:
hash_key = _default_hash_key
if isinstance(obj, ABCMultiIndex):
- return Series(hash_tuples(obj, encoding, hash_key),
- dtype='uint64', copy=False)
+ return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
if isinstance(obj, ABCIndexClass):
- h = hash_array(obj.values, encoding, hash_key,
- categorize).astype('uint64', copy=False)
- h = Series(h, index=obj, dtype='uint64', copy=False)
+ h = hash_array(obj.values, encoding, hash_key, categorize).astype(
+ "uint64", copy=False
+ )
+ h = Series(h, index=obj, dtype="uint64", copy=False)
elif isinstance(obj, ABCSeries):
- h = hash_array(obj.values, encoding, hash_key,
- categorize).astype('uint64', copy=False)
+ h = hash_array(obj.values, encoding, hash_key, categorize).astype(
+ "uint64", copy=False
+ )
if index:
- index_iter = (hash_pandas_object(obj.index,
- index=False,
- encoding=encoding,
- hash_key=hash_key,
- categorize=categorize).values
- for _ in [None])
+ index_iter = (
+ hash_pandas_object(
+ obj.index,
+ index=False,
+ encoding=encoding,
+ hash_key=hash_key,
+ categorize=categorize,
+ ).values
+ for _ in [None]
+ )
arrays = itertools.chain([h], index_iter)
h = _combine_hash_arrays(arrays, 2)
- h = Series(h, index=obj.index, dtype='uint64', copy=False)
+ h = Series(h, index=obj.index, dtype="uint64", copy=False)
elif isinstance(obj, ABCDataFrame):
hashes = (hash_array(series.values) for _, series in obj.iteritems())
num_items = len(obj.columns)
if index:
- index_hash_generator = (hash_pandas_object(obj.index,
- index=False,
- encoding=encoding,
- hash_key=hash_key,
- categorize=categorize).values # noqa
- for _ in [None])
+ index_hash_generator = (
+ hash_pandas_object(
+ obj.index,
+ index=False,
+ encoding=encoding,
+ hash_key=hash_key,
+ categorize=categorize,
+ ).values # noqa
+ for _ in [None]
+ )
num_items += 1
hashes = itertools.chain(hashes, index_hash_generator)
h = _combine_hash_arrays(hashes, num_items)
- h = Series(h, index=obj.index, dtype='uint64', copy=False)
+ h = Series(h, index=obj.index, dtype="uint64", copy=False)
else:
raise TypeError("Unexpected type for hashing %s" % type(obj))
return h
-def hash_tuples(vals, encoding='utf8', hash_key=None):
+def hash_tuples(vals, encoding="utf8", hash_key=None):
"""
Hash an MultiIndex / list-of-tuples efficiently
@@ -147,17 +165,15 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
vals = MultiIndex.from_tuples(vals)
# create a list-of-Categoricals
- vals = [Categorical(vals.codes[level],
- vals.levels[level],
- ordered=False,
- fastpath=True)
- for level in range(vals.nlevels)]
+ vals = [
+ Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
+ for level in range(vals.nlevels)
+ ]
# hash the list-of-ndarrays
- hashes = (_hash_categorical(cat,
- encoding=encoding,
- hash_key=hash_key)
- for cat in vals)
+ hashes = (
+ _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
+ )
h = _combine_hash_arrays(hashes, len(vals))
if is_tuple:
h = h[0]
@@ -165,7 +181,7 @@ def hash_tuples(vals, encoding='utf8', hash_key=None):
return h
-def hash_tuple(val, encoding='utf8', hash_key=None):
+def hash_tuple(val, encoding="utf8", hash_key=None):
"""
Hash a single tuple efficiently
@@ -180,8 +196,7 @@ def hash_tuple(val, encoding='utf8', hash_key=None):
hash
"""
- hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)
- for v in val)
+ hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) for v in val)
h = _combine_hash_arrays(hashes, len(val))[0]
@@ -205,8 +220,7 @@ def _hash_categorical(c, encoding, hash_key):
"""
# Convert ExtensionArrays to ndarrays
values = np.asarray(c.categories.values)
- hashed = hash_array(values, encoding, hash_key,
- categorize=False)
+ hashed = hash_array(values, encoding, hash_key, categorize=False)
# we have uint64, as we don't directly support missing values
# we don't want to use take_nd which will coerce to float
@@ -219,7 +233,7 @@ def _hash_categorical(c, encoding, hash_key):
if len(hashed):
result = hashed.take(c.codes)
else:
- result = np.zeros(len(mask), dtype='uint64')
+ result = np.zeros(len(mask), dtype="uint64")
if mask.any():
result[mask] = np.iinfo(np.uint64).max
@@ -227,7 +241,7 @@ def _hash_categorical(c, encoding, hash_key):
return result
-def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
+def hash_array(vals, encoding="utf8", hash_key=None, categorize=True):
"""
Given a 1d array, return an array of deterministic integers.
@@ -250,7 +264,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
1d uint64 numpy array of hash values, same length as the vals
"""
- if not hasattr(vals, 'dtype'):
+ if not hasattr(vals, "dtype"):
raise TypeError("must pass a ndarray-like")
dtype = vals.dtype
@@ -274,39 +288,40 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
elif isinstance(dtype, np.bool):
- vals = vals.astype('u8')
+ vals = vals.astype("u8")
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
- vals = vals.view('i8').astype('u8', copy=False)
+ vals = vals.view("i8").astype("u8", copy=False)
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
- vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
+ vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8")
else:
# With repeated values, its MUCH faster to categorize object dtypes,
# then hash and rename categories. We allow skipping the categorization
# when the values are known/likely to be unique.
if categorize:
from pandas import factorize, Categorical, Index
+
codes, categories = factorize(vals, sort=False)
- cat = Categorical(codes, Index(categories),
- ordered=False, fastpath=True)
+ cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
return _hash_categorical(cat, encoding, hash_key)
try:
vals = hashing.hash_object_array(vals, hash_key, encoding)
except TypeError:
# we have mixed types
- vals = hashing.hash_object_array(vals.astype(str).astype(object),
- hash_key, encoding)
+ vals = hashing.hash_object_array(
+ vals.astype(str).astype(object), hash_key, encoding
+ )
# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
- vals *= np.uint64(0xbf58476d1ce4e5b9)
+ vals *= np.uint64(0xBF58476D1CE4E5B9)
vals ^= vals >> 27
- vals *= np.uint64(0x94d049bb133111eb)
+ vals *= np.uint64(0x94D049BB133111EB)
vals ^= vals >> 31
return vals
-def _hash_scalar(val, encoding='utf8', hash_key=None):
+def _hash_scalar(val, encoding="utf8", hash_key=None):
"""
Hash scalar value
@@ -317,9 +332,9 @@ def _hash_scalar(val, encoding='utf8', hash_key=None):
if isna(val):
# this is to be consistent with the _hash_categorical implementation
- return np.array([np.iinfo(np.uint64).max], dtype='u8')
+ return np.array([np.iinfo(np.uint64).max], dtype="u8")
- if getattr(val, 'tzinfo', None) is not None:
+ if getattr(val, "tzinfo", None) is not None:
# for tz-aware datetimes, we need the underlying naive UTC value and
# not the tz aware object or pd extension type (as
# infer_dtype_from_scalar would do)
@@ -330,5 +345,4 @@ def _hash_scalar(val, encoding='utf8', hash_key=None):
dtype, val = infer_dtype_from_scalar(val)
vals = np.array([val], dtype=dtype)
- return hash_array(vals, hash_key=hash_key, encoding=encoding,
- categorize=False)
+ return hash_array(vals, hash_key=hash_key, encoding=encoding, categorize=False)
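The hashing helpers reformatted here are exposed through pandas.util; a minimal sketch of the public entry points (hashes are deterministic per row, so only shape/dtype and equality are asserted):

    import numpy as np
    import pandas as pd

    obj = pd.Series([1, 2, 3])
    h = pd.util.hash_pandas_object(obj, index=True)   # uint64 Series, one hash per row
    assert h.dtype == np.uint64 and len(h) == len(obj)

    a = pd.util.hash_array(np.array(["a", "b", "a"], dtype=object))
    assert a[0] == a[2]   # equal inputs hash equally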
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 8f888ba510b0e..27588249b1b3c 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -16,11 +16,24 @@
from pandas.util._decorators import Appender, Substitution, cache_readonly
from pandas.core.dtypes.common import (
- ensure_float64, is_bool, is_float_dtype, is_integer, is_integer_dtype,
- is_list_like, is_scalar, is_timedelta64_dtype, needs_i8_conversion)
+ ensure_float64,
+ is_bool,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_scalar,
+ is_timedelta64_dtype,
+ needs_i8_conversion,
+)
from pandas.core.dtypes.generic import (
- ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCPeriodIndex, ABCSeries,
- ABCTimedeltaIndex)
+ ABCDataFrame,
+ ABCDateOffset,
+ ABCDatetimeIndex,
+ ABCPeriodIndex,
+ ABCSeries,
+ ABCTimedeltaIndex,
+)
from pandas.core.base import DataError, PandasObject, SelectionMixin
import pandas.core.common as com
@@ -42,13 +55,29 @@
class _Window(PandasObject, SelectionMixin):
- _attributes = ['window', 'min_periods', 'center', 'win_type',
- 'axis', 'on', 'closed']
+ _attributes = [
+ "window",
+ "min_periods",
+ "center",
+ "win_type",
+ "axis",
+ "on",
+ "closed",
+ ]
exclusions = set() # type: Set[str]
- def __init__(self, obj, window=None, min_periods=None,
- center=False, win_type=None, axis=0, on=None, closed=None,
- **kwargs):
+ def __init__(
+ self,
+ obj,
+ window=None,
+ min_periods=None,
+ center=False,
+ win_type=None,
+ axis=0,
+ on=None,
+ closed=None,
+ **kwargs
+ ):
self.__dict__.update(kwargs)
self.blocks = []
@@ -77,18 +106,20 @@ def _on(self):
@property
def is_freq_type(self):
- return self.win_type == 'freq'
+ return self.win_type == "freq"
def validate(self):
if self.center is not None and not is_bool(self.center):
raise ValueError("center must be a boolean")
- if (self.min_periods is not None and
- not is_integer(self.min_periods)):
+ if self.min_periods is not None and not is_integer(self.min_periods):
raise ValueError("min_periods must be an integer")
- if (self.closed is not None and
- self.closed not in ['right', 'both', 'left', 'neither']):
- raise ValueError("closed must be 'right', 'left', 'both' or "
- "'neither'")
+ if self.closed is not None and self.closed not in [
+ "right",
+ "both",
+ "left",
+ "neither",
+ ]:
+ raise ValueError("closed must be 'right', 'left', 'both' or " "'neither'")
def _convert_freq(self):
"""
@@ -110,8 +141,7 @@ def _create_blocks(self):
# filter out the on from the object
if self.on is not None:
if obj.ndim == 2:
- obj = obj.reindex(columns=obj.columns.difference([self.on]),
- copy=False)
+ obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False)
blocks = obj._to_dict_of_blocks(copy=False).values()
return blocks, obj, index
@@ -145,8 +175,9 @@ def __getattr__(self, attr):
if attr in self.obj:
return self[attr]
- raise AttributeError("%r object has no attribute %r" %
- (type(self).__name__, attr))
+ raise AttributeError(
+ "%r object has no attribute %r" % (type(self).__name__, attr)
+ )
def _dir_additions(self):
return self.obj._dir_additions()
@@ -163,15 +194,18 @@ def __repr__(self):
Provide a nice str repr of our rolling object.
"""
- attrs = ("{k}={v}".format(k=k, v=getattr(self, k))
- for k in self._attributes
- if getattr(self, k, None) is not None)
- return "{klass} [{attrs}]".format(klass=self._window_type,
- attrs=','.join(attrs))
+ attrs = (
+ "{k}={v}".format(k=k, v=getattr(self, k))
+ for k in self._attributes
+ if getattr(self, k, None) is not None
+ )
+ return "{klass} [{attrs}]".format(
+ klass=self._window_type, attrs=",".join(attrs)
+ )
def __iter__(self):
- url = 'https://github.com/pandas-dev/pandas/issues/11704'
- raise NotImplementedError('See issue #11704 {url}'.format(url=url))
+ url = "https://github.com/pandas-dev/pandas/issues/11704"
+ raise NotImplementedError("See issue #11704 {url}".format(url=url))
def _get_index(self, index=None):
"""
@@ -191,7 +225,7 @@ def _get_index(self, index=None):
def _prep_values(self, values=None, kill_inf=True):
if values is None:
- values = getattr(self._selected_obj, 'values', self._selected_obj)
+ values = getattr(self._selected_obj, "values", self._selected_obj)
# GH #12373 : rolling functions error on float32 data
# make sure the data is coerced to float64
@@ -200,17 +234,18 @@ def _prep_values(self, values=None, kill_inf=True):
elif is_integer_dtype(values.dtype):
values = ensure_float64(values)
elif needs_i8_conversion(values.dtype):
- raise NotImplementedError("ops for {action} for this "
- "dtype {dtype} are not "
- "implemented".format(
- action=self._window_type,
- dtype=values.dtype))
+ raise NotImplementedError(
+ "ops for {action} for this "
+ "dtype {dtype} are not "
+ "implemented".format(action=self._window_type, dtype=values.dtype)
+ )
else:
try:
values = ensure_float64(values)
except (ValueError, TypeError):
- raise TypeError("cannot handle this type -> {0}"
- "".format(values.dtype))
+ raise TypeError(
+ "cannot handle this type -> {0}" "".format(values.dtype)
+ )
if kill_inf:
values = values.copy()
@@ -233,11 +268,14 @@ def _wrap_result(self, result, block=None, obj=None):
if block is not None:
if is_timedelta64_dtype(block.values.dtype):
from pandas import to_timedelta
- result = to_timedelta(
- result.ravel(), unit='ns').values.reshape(result.shape)
+
+ result = to_timedelta(result.ravel(), unit="ns").values.reshape(
+ result.shape
+ )
if result.ndim == 1:
from pandas import Series
+
return Series(result, index, name=obj.name)
return type(obj)(result, index=index, columns=block.columns)
@@ -291,10 +329,10 @@ def _wrap_results(self, results, blocks, obj, exclude=None):
columns = [c for c in columns if c not in exclude]
if not columns:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
if not len(final):
- return obj.astype('float64')
+ return obj.astype("float64")
return concat(final, axis=1).reindex(columns=columns, copy=False)
def _center_window(self, result, window):
@@ -302,8 +340,9 @@ def _center_window(self, result, window):
Center the result in the window.
"""
if self.axis > result.ndim - 1:
- raise ValueError("Requested axis is larger then no. of argument "
- "dimensions")
+ raise ValueError(
+ "Requested axis is larger then no. of argument " "dimensions"
+ )
offset = _offset(window, True)
if offset > 0:
@@ -323,7 +362,8 @@ def aggregate(self, func, *args, **kwargs):
agg = aggregate
- _shared_docs['sum'] = dedent("""
+ _shared_docs["sum"] = dedent(
+ """
Calculate %(name)s sum of given DataFrame or Series.
Parameters
@@ -396,9 +436,11 @@ def aggregate(self, func, *args, **kwargs):
2 6.0 14.0
3 9.0 29.0
4 12.0 50.0
- """)
+ """
+ )
- _shared_docs['mean'] = dedent("""
+ _shared_docs["mean"] = dedent(
+ """
Calculate the %(name)s mean of the values.
Parameters
@@ -440,7 +482,8 @@ def aggregate(self, func, *args, **kwargs):
2 2.0
3 3.0
dtype: float64
- """)
+ """
+ )
class Window(_Window):
@@ -605,17 +648,16 @@ def validate(self):
if window <= 0:
raise ValueError("window must be > 0 ")
import_optional_dependency(
- "scipy",
- extra="Scipy is required to generate window weight."
+ "scipy", extra="Scipy is required to generate window weight."
)
import scipy.signal as sig
if not isinstance(self.win_type, str):
- raise ValueError('Invalid win_type {0}'.format(self.win_type))
+ raise ValueError("Invalid win_type {0}".format(self.win_type))
if getattr(sig, self.win_type, None) is None:
- raise ValueError('Invalid win_type {0}'.format(self.win_type))
+ raise ValueError("Invalid win_type {0}".format(self.win_type))
else:
- raise ValueError('Invalid window {0}'.format(window))
+ raise ValueError("Invalid window {0}".format(window))
def _prep_window(self, **kwargs):
"""
@@ -631,16 +673,17 @@ def _prep_window(self, **kwargs):
# the below may pop from kwargs
def _validate_win_type(win_type, kwargs):
- arg_map = {'kaiser': ['beta'],
- 'gaussian': ['std'],
- 'general_gaussian': ['power', 'width'],
- 'slepian': ['width'],
- 'exponential': ['tau'],
- }
+ arg_map = {
+ "kaiser": ["beta"],
+ "gaussian": ["std"],
+ "general_gaussian": ["power", "width"],
+ "slepian": ["width"],
+ "exponential": ["tau"],
+ }
if win_type in arg_map:
win_args = _pop_args(win_type, arg_map[win_type], kwargs)
- if win_type == 'exponential':
+ if win_type == "exponential":
# exponential window requires the first arg (center)
# to be set to None (necessary for symmetric window)
win_args.insert(0, None)
@@ -650,7 +693,7 @@ def _validate_win_type(win_type, kwargs):
return win_type
def _pop_args(win_type, arg_names, kwargs):
- msg = '%s window requires %%s' % win_type
+ msg = "%s window requires %%s" % win_type
all_args = []
for n in arg_names:
if n not in kwargs:
@@ -694,7 +737,7 @@ def _apply_window(self, mean=True, **kwargs):
del block_list[i]
continue
else:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
if values.size == 0:
results.append(values.copy())
@@ -705,10 +748,12 @@ def _apply_window(self, mean=True, **kwargs):
def f(arg, *args, **kwargs):
minp = _use_window(self.min_periods, len(window))
- return libwindow.roll_window(np.concatenate((arg,
- additional_nans))
- if center else arg, window, minp,
- avg=mean)
+ return libwindow.roll_window(
+ np.concatenate((arg, additional_nans)) if center else arg,
+ window,
+ minp,
+ avg=mean,
+ )
result = np.apply_along_axis(f, self.axis, values)
@@ -718,14 +763,17 @@ def f(arg, *args, **kwargs):
return self._wrap_results(results, block_list, obj, exclude)
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
pandas.DataFrame.rolling.aggregate
pandas.DataFrame.aggregate
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
@@ -755,14 +803,17 @@ def f(arg, *args, **kwargs):
7 0.906020 1.283573 0.085482
8 -0.096361 0.818139 0.472290
9 0.070889 0.134399 -0.031308
- """)
-
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='Series/DataFrame',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="Series/DataFrame",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, arg, *args, **kwargs):
result, how = self._aggregate(arg, *args, **kwargs)
if result is None:
@@ -774,16 +825,16 @@ def aggregate(self, arg, *args, **kwargs):
agg = aggregate
- @Substitution(name='window')
- @Appender(_shared_docs['sum'])
+ @Substitution(name="window")
+ @Appender(_shared_docs["sum"])
def sum(self, *args, **kwargs):
- nv.validate_window_func('sum', args, kwargs)
+ nv.validate_window_func("sum", args, kwargs)
return self._apply_window(mean=False, **kwargs)
- @Substitution(name='window')
- @Appender(_shared_docs['mean'])
+ @Substitution(name="window")
+ @Appender(_shared_docs["mean"])
def mean(self, *args, **kwargs):
- nv.validate_window_func('mean', args, kwargs)
+ nv.validate_window_func("mean", args, kwargs)
return self._apply_window(mean=True, **kwargs)
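For reference, the weighted-window path exercised by Window.sum/Window.mean (scipy supplies the window weights, as the import_optional_dependency call above enforces):

    import pandas as pd

    s = pd.Series(range(5), dtype="float64")
    # win_type selects a scipy.signal window; extra args (here std) are checked
    # by _validate_win_type / _pop_args above
    s.rolling(window=3, win_type="gaussian").mean(std=1.0)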
@@ -793,8 +844,8 @@ class _GroupByMixin(GroupByMixin):
"""
def __init__(self, obj, *args, **kwargs):
- parent = kwargs.pop('parent', None) # noqa
- groupby = kwargs.pop('groupby', None)
+ parent = kwargs.pop("parent", None) # noqa
+ groupby = kwargs.pop("groupby", None)
if groupby is None:
groupby, obj = obj, obj.obj
self._groupby = groupby
@@ -802,12 +853,13 @@ def __init__(self, obj, *args, **kwargs):
self._groupby.grouper.mutated = True
super().__init__(obj, *args, **kwargs)
- count = GroupByMixin._dispatch('count')
- corr = GroupByMixin._dispatch('corr', other=None, pairwise=None)
- cov = GroupByMixin._dispatch('cov', other=None, pairwise=None)
+ count = GroupByMixin._dispatch("count")
+ corr = GroupByMixin._dispatch("corr", other=None, pairwise=None)
+ cov = GroupByMixin._dispatch("cov", other=None, pairwise=None)
- def _apply(self, func, name=None, window=None, center=None,
- check_minp=None, **kwargs):
+ def _apply(
+ self, func, name=None, window=None, center=None, check_minp=None, **kwargs
+ ):
"""
Dispatch to apply; we are stripping all of the _apply kwargs and
performing the original function call on the grouped object.
@@ -825,13 +877,13 @@ def f(x, name=name, *args):
class _Rolling(_Window):
-
@property
def _constructor(self):
return Rolling
- def _apply(self, func, name=None, window=None, center=None,
- check_minp=None, **kwargs):
+ def _apply(
+ self, func, name=None, window=None, center=None, check_minp=None, **kwargs
+ ):
"""
Rolling statistical measure using supplied function.
@@ -874,7 +926,7 @@ def _apply(self, func, name=None, window=None, center=None,
del block_list[i]
continue
else:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
if values.size == 0:
results.append(values.copy())
@@ -884,15 +936,16 @@ def _apply(self, func, name=None, window=None, center=None,
if isinstance(func, str):
cfunc = getattr(libwindow, func, None)
if cfunc is None:
- raise ValueError("we do not support this function "
- "in libwindow.{func}".format(func=func))
+ raise ValueError(
+ "we do not support this function "
+ "in libwindow.{func}".format(func=func)
+ )
def func(arg, window, min_periods=None, closed=None):
minp = check_minp(min_periods, window)
# ensure we are only rolling on floats
arg = ensure_float64(arg)
- return cfunc(arg,
- window, minp, indexi, closed, **kwargs)
+ return cfunc(arg, window, minp, indexi, closed, **kwargs)
# calculation function
if center:
@@ -900,16 +953,21 @@ def func(arg, window, min_periods=None, closed=None):
additional_nans = np.array([np.NaN] * offset)
def calc(x):
- return func(np.concatenate((x, additional_nans)),
- window, min_periods=self.min_periods,
- closed=self.closed)
+ return func(
+ np.concatenate((x, additional_nans)),
+ window,
+ min_periods=self.min_periods,
+ closed=self.closed,
+ )
+
else:
def calc(x):
- return func(x, window, min_periods=self.min_periods,
- closed=self.closed)
+ return func(
+ x, window, min_periods=self.min_periods, closed=self.closed
+ )
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
if values.ndim > 1:
result = np.apply_along_axis(calc, self.axis, values)
else:
@@ -925,7 +983,8 @@ def calc(x):
class _Rolling_and_Expanding(_Rolling):
- _shared_docs['count'] = dedent(r"""
+ _shared_docs["count"] = dedent(
+ r"""
The %(name)s count of any non-NaN observations inside the window.
Returns
@@ -961,7 +1020,8 @@ class _Rolling_and_Expanding(_Rolling):
2 2.0
3 3.0
dtype: float64
- """)
+ """
+ )
def count(self):
@@ -975,15 +1035,20 @@ def count(self):
results = []
for b in blocks:
result = b.notna().astype(int)
- result = self._constructor(result, window=window, min_periods=0,
- center=self.center,
- axis=self.axis,
- closed=self.closed).sum()
+ result = self._constructor(
+ result,
+ window=window,
+ min_periods=0,
+ center=self.center,
+ axis=self.axis,
+ closed=self.closed,
+ ).sum()
results.append(result)
return self._wrap_results(results, blocks, obj)
- _shared_docs['apply'] = dedent(r"""
+ _shared_docs["apply"] = dedent(
+ r"""
The %(name)s function's apply function.
Parameters
@@ -1015,13 +1080,14 @@ def count(self):
--------
Series.%(name)s : Series %(name)s.
DataFrame.%(name)s : DataFrame %(name)s.
- """)
+ """
+ )
def apply(self, func, raw=None, args=(), kwargs={}):
from pandas import Series
# TODO: _level is unused?
- _level = kwargs.pop('_level', None) # noqa
+ _level = kwargs.pop("_level", None) # noqa
window = self._get_window()
offset = _offset(window, self.center)
index, indexi = self._get_index()
@@ -1034,7 +1100,10 @@ def apply(self, func, raw=None, args=(), kwargs={}):
"applied function. In the future, this will change to passing "
"it as Series objects. You need to specify 'raw=True' to keep "
"the current behaviour, and you can pass 'raw=False' to "
- "silence this warning", FutureWarning, stacklevel=3)
+ "silence this warning",
+ FutureWarning,
+ stacklevel=3,
+ )
raw = True
def f(arg, window, min_periods, closed):
@@ -1042,30 +1111,32 @@ def f(arg, window, min_periods, closed):
if not raw:
arg = Series(arg, index=self.obj.index)
return libwindow.roll_generic(
- arg, window, minp, indexi,
- closed, offset, func, raw, args, kwargs)
+ arg, window, minp, indexi, closed, offset, func, raw, args, kwargs
+ )
- return self._apply(f, func, args=args, kwargs=kwargs,
- center=False, raw=raw)
+ return self._apply(f, func, args=args, kwargs=kwargs, center=False, raw=raw)
def sum(self, *args, **kwargs):
- nv.validate_window_func('sum', args, kwargs)
- return self._apply('roll_sum', 'sum', **kwargs)
+ nv.validate_window_func("sum", args, kwargs)
+ return self._apply("roll_sum", "sum", **kwargs)
- _shared_docs['max'] = dedent("""
+ _shared_docs["max"] = dedent(
+ """
Calculate the %(name)s maximum.
Parameters
----------
*args, **kwargs
Arguments and keyword arguments to be passed into func.
- """)
+ """
+ )
def max(self, *args, **kwargs):
- nv.validate_window_func('max', args, kwargs)
- return self._apply('roll_max', 'max', **kwargs)
+ nv.validate_window_func("max", args, kwargs)
+ return self._apply("roll_max", "max", **kwargs)
- _shared_docs['min'] = dedent("""
+ _shared_docs["min"] = dedent(
+ """
Calculate the %(name)s minimum.
Parameters
@@ -1098,17 +1169,19 @@ def max(self, *args, **kwargs):
3 2.0
4 2.0
dtype: float64
- """)
+ """
+ )
def min(self, *args, **kwargs):
- nv.validate_window_func('min', args, kwargs)
- return self._apply('roll_min', 'min', **kwargs)
+ nv.validate_window_func("min", args, kwargs)
+ return self._apply("roll_min", "min", **kwargs)
def mean(self, *args, **kwargs):
- nv.validate_window_func('mean', args, kwargs)
- return self._apply('roll_mean', 'mean', **kwargs)
+ nv.validate_window_func("mean", args, kwargs)
+ return self._apply("roll_mean", "mean", **kwargs)
- _shared_docs['median'] = dedent("""
+ _shared_docs["median"] = dedent(
+ """
Calculate the %(name)s median.
Parameters
@@ -1141,12 +1214,14 @@ def mean(self, *args, **kwargs):
3 2.0
4 3.0
dtype: float64
- """)
+ """
+ )
def median(self, **kwargs):
- return self._apply('roll_median_c', 'median', **kwargs)
+ return self._apply("roll_median_c", "median", **kwargs)
- _shared_docs['std'] = dedent("""
+ _shared_docs["std"] = dedent(
+ """
Calculate %(name)s standard deviation.
Normalized by N-1 by default. This can be changed using the `ddof`
@@ -1202,22 +1277,26 @@ def median(self, **kwargs):
5 0.836660
6 0.786796
dtype: float64
- """)
+ """
+ )
def std(self, ddof=1, *args, **kwargs):
- nv.validate_window_func('std', args, kwargs)
+ nv.validate_window_func("std", args, kwargs)
window = self._get_window()
index, indexi = self._get_index()
def f(arg, *args, **kwargs):
minp = _require_min_periods(1)(self.min_periods, window)
- return _zsqrt(libwindow.roll_var(arg, window, minp, indexi,
- self.closed, ddof))
+ return _zsqrt(
+ libwindow.roll_var(arg, window, minp, indexi, self.closed, ddof)
+ )
- return self._apply(f, 'std', check_minp=_require_min_periods(1),
- ddof=ddof, **kwargs)
+ return self._apply(
+ f, "std", check_minp=_require_min_periods(1), ddof=ddof, **kwargs
+ )
- _shared_docs['var'] = dedent("""
+ _shared_docs["var"] = dedent(
+ """
Calculate unbiased %(name)s variance.
Normalized by N-1 by default. This can be changed using the `ddof`
@@ -1273,15 +1352,18 @@ def f(arg, *args, **kwargs):
5 0.700000
6 0.619048
dtype: float64
- """)
+ """
+ )
def var(self, ddof=1, *args, **kwargs):
- nv.validate_window_func('var', args, kwargs)
- return self._apply('roll_var', 'var',
- check_minp=_require_min_periods(1), ddof=ddof,
- **kwargs)
-
- _shared_docs['skew'] = """
+ nv.validate_window_func("var", args, kwargs)
+ return self._apply(
+ "roll_var", "var", check_minp=_require_min_periods(1), ddof=ddof, **kwargs
+ )
+
+ _shared_docs[
+ "skew"
+ ] = """
Unbiased %(name)s skewness.
Parameters
@@ -1291,10 +1373,12 @@ def var(self, ddof=1, *args, **kwargs):
"""
def skew(self, **kwargs):
- return self._apply('roll_skew', 'skew',
- check_minp=_require_min_periods(3), **kwargs)
+ return self._apply(
+ "roll_skew", "skew", check_minp=_require_min_periods(3), **kwargs
+ )
- _shared_docs['kurt'] = dedent("""
+ _shared_docs["kurt"] = dedent(
+ """
Calculate unbiased %(name)s kurtosis.
This function uses Fisher's definition of kurtosis without bias.
@@ -1322,13 +1406,16 @@ def skew(self, **kwargs):
Notes
-----
A minimum of 4 periods is required for the %(name)s calculation.
- """)
+ """
+ )
def kurt(self, **kwargs):
- return self._apply('roll_kurt', 'kurt',
- check_minp=_require_min_periods(4), **kwargs)
+ return self._apply(
+ "roll_kurt", "kurt", check_minp=_require_min_periods(4), **kwargs
+ )
- _shared_docs['quantile'] = dedent("""
+ _shared_docs["quantile"] = dedent(
+ """
Calculate the %(name)s quantile.
Parameters
@@ -1380,29 +1467,29 @@ def kurt(self, **kwargs):
2 2.5
3 3.5
dtype: float64
- """)
+ """
+ )
- def quantile(self, quantile, interpolation='linear', **kwargs):
+ def quantile(self, quantile, interpolation="linear", **kwargs):
window = self._get_window()
index, indexi = self._get_index()
def f(arg, *args, **kwargs):
minp = _use_window(self.min_periods, window)
if quantile == 1.0:
- return libwindow.roll_max(arg, window, minp, indexi,
- self.closed)
+ return libwindow.roll_max(arg, window, minp, indexi, self.closed)
elif quantile == 0.0:
- return libwindow.roll_min(arg, window, minp, indexi,
- self.closed)
+ return libwindow.roll_min(arg, window, minp, indexi, self.closed)
else:
- return libwindow.roll_quantile(arg, window, minp, indexi,
- self.closed, quantile,
- interpolation)
+ return libwindow.roll_quantile(
+ arg, window, minp, indexi, self.closed, quantile, interpolation
+ )
- return self._apply(f, 'quantile', quantile=quantile,
- **kwargs)
+ return self._apply(f, "quantile", quantile=quantile, **kwargs)
- _shared_docs['cov'] = """
+ _shared_docs[
+ "cov"
+ ] = """
Calculate the %(name)s sample covariance.
Parameters
@@ -1440,19 +1527,21 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
def _get_cov(X, Y):
# GH #12373 : rolling functions error on float32 data
# to avoid potential overflow, cast the data to float64
- X = X.astype('float64')
- Y = Y.astype('float64')
- mean = lambda x: x.rolling(window, self.min_periods,
- center=self.center).mean(**kwargs)
- count = (X + Y).rolling(window=window,
- center=self.center).count(**kwargs)
+ X = X.astype("float64")
+ Y = Y.astype("float64")
+ mean = lambda x: x.rolling(
+ window, self.min_periods, center=self.center
+ ).mean(**kwargs)
+ count = (X + Y).rolling(window=window, center=self.center).count(**kwargs)
bias_adj = count / (count - ddof)
return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj
- return _flex_binary_moment(self._selected_obj, other._selected_obj,
- _get_cov, pairwise=bool(pairwise))
+ return _flex_binary_moment(
+ self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)
+ )
- _shared_docs['corr'] = dedent("""
+ _shared_docs["corr"] = dedent(
+ """
Calculate %(name)s correlation.
Parameters
@@ -1559,7 +1648,8 @@ def _get_cov(X, Y):
Y 0.626300 1.000000
4 X 1.000000 0.555368
Y 0.555368 1.000000
- """)
+ """
+ )
def corr(self, other=None, pairwise=None, **kwargs):
if other is None:
@@ -1570,60 +1660,66 @@ def corr(self, other=None, pairwise=None, **kwargs):
window = self._get_window(other)
def _get_corr(a, b):
- a = a.rolling(window=window, min_periods=self.min_periods,
- center=self.center)
- b = b.rolling(window=window, min_periods=self.min_periods,
- center=self.center)
+ a = a.rolling(
+ window=window, min_periods=self.min_periods, center=self.center
+ )
+ b = b.rolling(
+ window=window, min_periods=self.min_periods, center=self.center
+ )
return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs))
- return _flex_binary_moment(self._selected_obj, other._selected_obj,
- _get_corr, pairwise=bool(pairwise))
+ return _flex_binary_moment(
+ self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise)
+ )
class Rolling(_Rolling_and_Expanding):
-
@cache_readonly
def is_datetimelike(self):
- return isinstance(self._on,
- (ABCDatetimeIndex,
- ABCTimedeltaIndex,
- ABCPeriodIndex))
+ return isinstance(
+ self._on, (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex)
+ )
@cache_readonly
def _on(self):
if self.on is None:
return self.obj.index
- elif (isinstance(self.obj, ABCDataFrame) and
- self.on in self.obj.columns):
+ elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns:
from pandas import Index
+
return Index(self.obj[self.on])
else:
- raise ValueError("invalid on specified as {0}, "
- "must be a column (if DataFrame) "
- "or None".format(self.on))
+ raise ValueError(
+ "invalid on specified as {0}, "
+ "must be a column (if DataFrame) "
+ "or None".format(self.on)
+ )
def validate(self):
super().validate()
# we allow rolling on a datetimelike index
- if ((self.obj.empty or self.is_datetimelike) and
- isinstance(self.window, (str, ABCDateOffset, timedelta))):
+ if (self.obj.empty or self.is_datetimelike) and isinstance(
+ self.window, (str, ABCDateOffset, timedelta)
+ ):
self._validate_monotonic()
freq = self._validate_freq()
# we don't allow center
if self.center:
- raise NotImplementedError("center is not implemented "
- "for datetimelike and offset "
- "based windows")
+ raise NotImplementedError(
+ "center is not implemented "
+ "for datetimelike and offset "
+ "based windows"
+ )
# this will raise ValueError on non-fixed freqs
self.win_freq = self.window
self.window = freq.nanos
- self.win_type = 'freq'
+ self.win_type = "freq"
# min_periods must be an integer
if self.min_periods is None:
@@ -1635,38 +1731,44 @@ def validate(self):
raise ValueError("window must be non-negative")
if not self.is_datetimelike and self.closed is not None:
- raise ValueError("closed only implemented for datetimelike "
- "and offset based windows")
+ raise ValueError(
+ "closed only implemented for datetimelike " "and offset based windows"
+ )
def _validate_monotonic(self):
"""
Validate on is_monotonic.
"""
if not self._on.is_monotonic:
- formatted = self.on or 'index'
- raise ValueError("{0} must be "
- "monotonic".format(formatted))
+ formatted = self.on or "index"
+ raise ValueError("{0} must be " "monotonic".format(formatted))
def _validate_freq(self):
"""
Validate & return window frequency.
"""
from pandas.tseries.frequencies import to_offset
+
try:
return to_offset(self.window)
except (TypeError, ValueError):
- raise ValueError("passed window {0} is not "
- "compatible with a datetimelike "
- "index".format(self.window))
+ raise ValueError(
+ "passed window {0} is not "
+ "compatible with a datetimelike "
+ "index".format(self.window)
+ )
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
Series.rolling
DataFrame.rolling
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
@@ -1709,83 +1811,87 @@ def _validate_freq(self):
7 2.718061 -1.647453
8 -0.289082 -1.647453
9 0.212668 -1.647453
- """)
-
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='Series/Dataframe',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="Series/Dataframe",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, arg, *args, **kwargs):
return super().aggregate(arg, *args, **kwargs)
agg = aggregate
- @Substitution(name='rolling')
- @Appender(_shared_docs['count'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["count"])
def count(self):
# different impl for freq counting
if self.is_freq_type:
- return self._apply('roll_count', 'count')
+ return self._apply("roll_count", "count")
return super().count()
- @Substitution(name='rolling')
- @Appender(_shared_docs['apply'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["apply"])
def apply(self, func, raw=None, args=(), kwargs={}):
return super().apply(func, raw=raw, args=args, kwargs=kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['sum'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["sum"])
def sum(self, *args, **kwargs):
- nv.validate_rolling_func('sum', args, kwargs)
+ nv.validate_rolling_func("sum", args, kwargs)
return super().sum(*args, **kwargs)
- @Substitution(name='rolling')
+ @Substitution(name="rolling")
@Appender(_doc_template)
- @Appender(_shared_docs['max'])
+ @Appender(_shared_docs["max"])
def max(self, *args, **kwargs):
- nv.validate_rolling_func('max', args, kwargs)
+ nv.validate_rolling_func("max", args, kwargs)
return super().max(*args, **kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['min'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["min"])
def min(self, *args, **kwargs):
- nv.validate_rolling_func('min', args, kwargs)
+ nv.validate_rolling_func("min", args, kwargs)
return super().min(*args, **kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['mean'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["mean"])
def mean(self, *args, **kwargs):
- nv.validate_rolling_func('mean', args, kwargs)
+ nv.validate_rolling_func("mean", args, kwargs)
return super().mean(*args, **kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['median'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["median"])
def median(self, **kwargs):
return super().median(**kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['std'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["std"])
def std(self, ddof=1, *args, **kwargs):
- nv.validate_rolling_func('std', args, kwargs)
+ nv.validate_rolling_func("std", args, kwargs)
return super().std(ddof=ddof, **kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['var'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["var"])
def var(self, ddof=1, *args, **kwargs):
- nv.validate_rolling_func('var', args, kwargs)
+ nv.validate_rolling_func("var", args, kwargs)
return super().var(ddof=ddof, **kwargs)
- @Substitution(name='rolling')
+ @Substitution(name="rolling")
@Appender(_doc_template)
- @Appender(_shared_docs['skew'])
+ @Appender(_shared_docs["skew"])
def skew(self, **kwargs):
return super().skew(**kwargs)
- _agg_doc = dedent("""
+ _agg_doc = dedent(
+ """
Examples
--------
@@ -1807,28 +1913,30 @@ def skew(self, **kwargs):
3 -1.200000
4 3.999946
dtype: float64
- """)
+ """
+ )
@Appender(_agg_doc)
- @Substitution(name='rolling')
- @Appender(_shared_docs['kurt'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["kurt"])
def kurt(self, **kwargs):
return super().kurt(**kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['quantile'])
- def quantile(self, quantile, interpolation='linear', **kwargs):
- return super().quantile(quantile=quantile, interpolation=interpolation,
- **kwargs)
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["quantile"])
+ def quantile(self, quantile, interpolation="linear", **kwargs):
+ return super().quantile(
+ quantile=quantile, interpolation=interpolation, **kwargs
+ )
- @Substitution(name='rolling')
+ @Substitution(name="rolling")
@Appender(_doc_template)
- @Appender(_shared_docs['cov'])
+ @Appender(_shared_docs["cov"])
def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs)
- @Substitution(name='rolling')
- @Appender(_shared_docs['corr'])
+ @Substitution(name="rolling")
+ @Appender(_shared_docs["corr"])
def corr(self, other=None, pairwise=None, **kwargs):
return super().corr(other=other, pairwise=pairwise, **kwargs)
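The Rolling class above also owns the offset-based (time-aware) windows whose validation was reformatted; a brief sketch of that public behaviour (the on column must be monotonic, per _validate_monotonic):

    import pandas as pd

    df = pd.DataFrame(
        {"time": pd.date_range("2019-01-01", periods=5, freq="s"), "x": range(5)}
    )
    df.rolling("2s", on="time")["x"].sum()   # variable-length windows spanning 2 seconds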
@@ -1840,6 +1948,7 @@ class RollingGroupby(_GroupByMixin, Rolling):
.. versionadded:: 0.18.1
"""
+
@property
def _constructor(self):
return Rolling
@@ -1913,12 +2022,10 @@ class Expanding(_Rolling_and_Expanding):
4 7.0
"""
- _attributes = ['min_periods', 'center', 'axis']
+ _attributes = ["min_periods", "center", "axis"]
- def __init__(self, obj, min_periods=1, center=False, axis=0,
- **kwargs):
- super().__init__(obj=obj, min_periods=min_periods, center=center,
- axis=axis)
+ def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs):
+ super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis)
@property
def _constructor(self):
@@ -1945,15 +2052,18 @@ def _get_window(self, other=None):
other = self.min_periods or -1
return max(length, other)
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
DataFrame.expanding.aggregate
DataFrame.rolling.aggregate
DataFrame.aggregate
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
@@ -1983,79 +2093,82 @@ def _get_window(self, other=None):
7 0.680292 0.132049 0.548693
8 0.067236 0.948257 0.163353
9 -0.286980 0.618493 -0.694496
- """)
-
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='Series/Dataframe',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="Series/Dataframe",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, arg, *args, **kwargs):
return super().aggregate(arg, *args, **kwargs)
agg = aggregate
- @Substitution(name='expanding')
- @Appender(_shared_docs['count'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["count"])
def count(self, **kwargs):
return super().count(**kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['apply'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["apply"])
def apply(self, func, raw=None, args=(), kwargs={}):
- return super().apply(
- func, raw=raw, args=args, kwargs=kwargs)
+ return super().apply(func, raw=raw, args=args, kwargs=kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['sum'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["sum"])
def sum(self, *args, **kwargs):
- nv.validate_expanding_func('sum', args, kwargs)
+ nv.validate_expanding_func("sum", args, kwargs)
return super().sum(*args, **kwargs)
- @Substitution(name='expanding')
+ @Substitution(name="expanding")
@Appender(_doc_template)
- @Appender(_shared_docs['max'])
+ @Appender(_shared_docs["max"])
def max(self, *args, **kwargs):
- nv.validate_expanding_func('max', args, kwargs)
+ nv.validate_expanding_func("max", args, kwargs)
return super().max(*args, **kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['min'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["min"])
def min(self, *args, **kwargs):
- nv.validate_expanding_func('min', args, kwargs)
+ nv.validate_expanding_func("min", args, kwargs)
return super().min(*args, **kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['mean'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["mean"])
def mean(self, *args, **kwargs):
- nv.validate_expanding_func('mean', args, kwargs)
+ nv.validate_expanding_func("mean", args, kwargs)
return super().mean(*args, **kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['median'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["median"])
def median(self, **kwargs):
return super().median(**kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['std'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["std"])
def std(self, ddof=1, *args, **kwargs):
- nv.validate_expanding_func('std', args, kwargs)
+ nv.validate_expanding_func("std", args, kwargs)
return super().std(ddof=ddof, **kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['var'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["var"])
def var(self, ddof=1, *args, **kwargs):
- nv.validate_expanding_func('var', args, kwargs)
+ nv.validate_expanding_func("var", args, kwargs)
return super().var(ddof=ddof, **kwargs)
- @Substitution(name='expanding')
+ @Substitution(name="expanding")
@Appender(_doc_template)
- @Appender(_shared_docs['skew'])
+ @Appender(_shared_docs["skew"])
def skew(self, **kwargs):
return super().skew(**kwargs)
- _agg_doc = dedent("""
+ _agg_doc = dedent(
+ """
Examples
--------
@@ -2077,29 +2190,30 @@ def skew(self, **kwargs):
3 -1.200000
4 4.999874
dtype: float64
- """)
+ """
+ )
@Appender(_agg_doc)
- @Substitution(name='expanding')
- @Appender(_shared_docs['kurt'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["kurt"])
def kurt(self, **kwargs):
return super().kurt(**kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['quantile'])
- def quantile(self, quantile, interpolation='linear', **kwargs):
- return super().quantile(quantile=quantile,
- interpolation=interpolation,
- **kwargs)
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["quantile"])
+ def quantile(self, quantile, interpolation="linear", **kwargs):
+ return super().quantile(
+ quantile=quantile, interpolation=interpolation, **kwargs
+ )
- @Substitution(name='expanding')
+ @Substitution(name="expanding")
@Appender(_doc_template)
- @Appender(_shared_docs['cov'])
+ @Appender(_shared_docs["cov"])
def cov(self, other=None, pairwise=None, ddof=1, **kwargs):
return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs)
- @Substitution(name='expanding')
- @Appender(_shared_docs['corr'])
+ @Substitution(name="expanding")
+ @Appender(_shared_docs["corr"])
def corr(self, other=None, pairwise=None, **kwargs):
return super().corr(other=other, pairwise=pairwise, **kwargs)
@@ -2111,6 +2225,7 @@ class ExpandingGroupby(_GroupByMixin, Expanding):
.. versionadded:: 0.18.1
"""
+
@property
def _constructor(self):
return Expanding
@@ -2239,11 +2354,20 @@ class EWM(_Rolling):
3 1.615385
4 3.670213
"""
- _attributes = ['com', 'min_periods', 'adjust', 'ignore_na', 'axis']
-
- def __init__(self, obj, com=None, span=None, halflife=None, alpha=None,
- min_periods=0, adjust=True, ignore_na=False,
- axis=0):
+ _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"]
+
+ def __init__(
+ self,
+ obj,
+ com=None,
+ span=None,
+ halflife=None,
+ alpha=None,
+ min_periods=0,
+ adjust=True,
+ ignore_na=False,
+ axis=0,
+ ):
self.obj = obj
self.com = _get_center_of_mass(com, span, halflife, alpha)
self.min_periods = min_periods
@@ -2256,13 +2380,16 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None,
def _constructor(self):
return EWM
- _agg_see_also_doc = dedent("""
+ _agg_see_also_doc = dedent(
+ """
See Also
--------
pandas.DataFrame.rolling.aggregate
- """)
+ """
+ )
- _agg_examples_doc = dedent("""
+ _agg_examples_doc = dedent(
+ """
Examples
--------
@@ -2292,14 +2419,17 @@ def _constructor(self):
7 0.680292 0.132049 0.548693
8 0.067236 0.948257 0.163353
9 -0.286980 0.618493 -0.694496
- """)
-
- @Substitution(see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- versionadded='',
- klass='Series/Dataframe',
- axis='')
- @Appender(_shared_docs['aggregate'])
+ """
+ )
+
+ @Substitution(
+ see_also=_agg_see_also_doc,
+ examples=_agg_examples_doc,
+ versionadded="",
+ klass="Series/Dataframe",
+ axis="",
+ )
+ @Appender(_shared_docs["aggregate"])
def aggregate(self, arg, *args, **kwargs):
return super().aggregate(arg, *args, **kwargs)
@@ -2333,7 +2463,7 @@ def _apply(self, func, **kwargs):
del block_list[i]
continue
else:
- raise DataError('No numeric types to aggregate')
+ raise DataError("No numeric types to aggregate")
if values.size == 0:
results.append(values.copy())
@@ -2343,18 +2473,25 @@ def _apply(self, func, **kwargs):
if isinstance(func, str):
cfunc = getattr(libwindow, func, None)
if cfunc is None:
- raise ValueError("we do not support this function "
- "in libwindow.{func}".format(func=func))
+ raise ValueError(
+ "we do not support this function "
+ "in libwindow.{func}".format(func=func)
+ )
def func(arg):
- return cfunc(arg, self.com, int(self.adjust),
- int(self.ignore_na), int(self.min_periods))
+ return cfunc(
+ arg,
+ self.com,
+ int(self.adjust),
+ int(self.ignore_na),
+ int(self.min_periods),
+ )
results.append(np.apply_along_axis(func, self.axis, values))
return self._wrap_results(results, block_list, obj, exclude)
- @Substitution(name='ewm')
+ @Substitution(name="ewm")
@Appender(_doc_template)
def mean(self, *args, **kwargs):
"""
@@ -2365,38 +2502,44 @@ def mean(self, *args, **kwargs):
*args, **kwargs
Arguments and keyword arguments to be passed into func.
"""
- nv.validate_window_func('mean', args, kwargs)
- return self._apply('ewma', **kwargs)
+ nv.validate_window_func("mean", args, kwargs)
+ return self._apply("ewma", **kwargs)
- @Substitution(name='ewm')
+ @Substitution(name="ewm")
@Appender(_doc_template)
@Appender(_bias_template)
def std(self, bias=False, *args, **kwargs):
"""
Exponential weighted moving stddev.
"""
- nv.validate_window_func('std', args, kwargs)
+ nv.validate_window_func("std", args, kwargs)
return _zsqrt(self.var(bias=bias, **kwargs))
vol = std
- @Substitution(name='ewm')
+ @Substitution(name="ewm")
@Appender(_doc_template)
@Appender(_bias_template)
def var(self, bias=False, *args, **kwargs):
"""
Exponential weighted moving variance.
"""
- nv.validate_window_func('var', args, kwargs)
+ nv.validate_window_func("var", args, kwargs)
def f(arg):
- return libwindow.ewmcov(arg, arg, self.com, int(self.adjust),
- int(self.ignore_na), int(self.min_periods),
- int(bias))
+ return libwindow.ewmcov(
+ arg,
+ arg,
+ self.com,
+ int(self.adjust),
+ int(self.ignore_na),
+ int(self.min_periods),
+ int(bias),
+ )
return self._apply(f, **kwargs)
- @Substitution(name='ewm')
+ @Substitution(name="ewm")
@Appender(_doc_template)
@Appender(_pairwise_template)
def cov(self, other=None, pairwise=None, bias=False, **kwargs):
@@ -2412,16 +2555,22 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs):
def _get_cov(X, Y):
X = self._shallow_copy(X)
Y = self._shallow_copy(Y)
- cov = libwindow.ewmcov(X._prep_values(), Y._prep_values(),
- self.com, int(self.adjust),
- int(self.ignore_na), int(self.min_periods),
- int(bias))
+ cov = libwindow.ewmcov(
+ X._prep_values(),
+ Y._prep_values(),
+ self.com,
+ int(self.adjust),
+ int(self.ignore_na),
+ int(self.min_periods),
+ int(bias),
+ )
return X._wrap_result(cov)
- return _flex_binary_moment(self._selected_obj, other._selected_obj,
- _get_cov, pairwise=bool(pairwise))
+ return _flex_binary_moment(
+ self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)
+ )
- @Substitution(name='ewm')
+ @Substitution(name="ewm")
@Appender(_doc_template)
@Appender(_pairwise_template)
def corr(self, other=None, pairwise=None, **kwargs):
@@ -2439,35 +2588,47 @@ def _get_corr(X, Y):
Y = self._shallow_copy(Y)
def _cov(x, y):
- return libwindow.ewmcov(x, y, self.com, int(self.adjust),
- int(self.ignore_na),
- int(self.min_periods),
- 1)
+ return libwindow.ewmcov(
+ x,
+ y,
+ self.com,
+ int(self.adjust),
+ int(self.ignore_na),
+ int(self.min_periods),
+ 1,
+ )
x_values = X._prep_values()
y_values = Y._prep_values()
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
cov = _cov(x_values, y_values)
x_var = _cov(x_values, x_values)
y_var = _cov(y_values, y_values)
corr = cov / _zsqrt(x_var * y_var)
return X._wrap_result(corr)
- return _flex_binary_moment(self._selected_obj, other._selected_obj,
- _get_corr, pairwise=bool(pairwise))
+ return _flex_binary_moment(
+ self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise)
+ )
+
# Helper Funcs
def _flex_binary_moment(arg1, arg2, f, pairwise=False):
- if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and
- isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))):
- raise TypeError("arguments to moment function must be of type "
- "np.ndarray/Series/DataFrame")
-
- if (isinstance(arg1, (np.ndarray, ABCSeries)) and
- isinstance(arg2, (np.ndarray, ABCSeries))):
+ if not (
+ isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame))
+ and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))
+ ):
+ raise TypeError(
+ "arguments to moment function must be of type "
+ "np.ndarray/Series/DataFrame"
+ )
+
+ if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance(
+ arg2, (np.ndarray, ABCSeries)
+ ):
X, Y = _prep_binary(arg1, arg2)
return f(X, Y)
@@ -2495,7 +2656,7 @@ def dataframe_from_int_dict(data, frame_template):
raise ValueError("'arg2' columns are not unique")
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore", RuntimeWarning)
- X, Y = arg1.align(arg2, join='outer')
+ X, Y = arg1.align(arg2, join="outer")
X = X + 0 * Y
Y = Y + 0 * X
@@ -2505,8 +2666,7 @@ def dataframe_from_int_dict(data, frame_template):
for col in res_columns:
if col in X and col in Y:
results[col] = f(X[col], Y[col])
- return DataFrame(results, index=X.index,
- columns=res_columns)
+ return DataFrame(results, index=X.index, columns=res_columns)
elif pairwise is True:
results = defaultdict(dict)
for i, k1 in enumerate(arg1.columns):
@@ -2515,8 +2675,9 @@ def dataframe_from_int_dict(data, frame_template):
# Symmetric case
results[i][j] = results[j][i]
else:
- results[i][j] = f(*_prep_binary(arg1.iloc[:, i],
- arg2.iloc[:, j]))
+ results[i][j] = f(
+ *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])
+ )
from pandas import MultiIndex, concat
@@ -2525,50 +2686,60 @@ def dataframe_from_int_dict(data, frame_template):
# construct result frame
result = concat(
- [concat([results[i][j]
- for j, c in enumerate(arg2.columns)],
- ignore_index=True)
- for i, c in enumerate(arg1.columns)],
+ [
+ concat(
+ [results[i][j] for j, c in enumerate(arg2.columns)],
+ ignore_index=True,
+ )
+ for i, c in enumerate(arg1.columns)
+ ],
ignore_index=True,
- axis=1)
+ axis=1,
+ )
result.columns = arg1.columns
# set the index and reorder
if arg2.columns.nlevels > 1:
result.index = MultiIndex.from_product(
- arg2.columns.levels + [result_index])
+ arg2.columns.levels + [result_index]
+ )
result = result.reorder_levels([2, 0, 1]).sort_index()
else:
result.index = MultiIndex.from_product(
- [range(len(arg2.columns)),
- range(len(result_index))])
+ [range(len(arg2.columns)), range(len(result_index))]
+ )
result = result.swaplevel(1, 0).sort_index()
result.index = MultiIndex.from_product(
- [result_index] + [arg2.columns])
+ [result_index] + [arg2.columns]
+ )
else:
# empty result
result = DataFrame(
- index=MultiIndex(levels=[arg1.index, arg2.columns],
- codes=[[], []]),
+ index=MultiIndex(
+ levels=[arg1.index, arg2.columns], codes=[[], []]
+ ),
columns=arg2.columns,
- dtype='float64')
+ dtype="float64",
+ )
# reset our index names to arg1 names
# reset our column names to arg2 names
# careful not to mutate the original names
- result.columns = result.columns.set_names(
- arg1.columns.names)
+ result.columns = result.columns.set_names(arg1.columns.names)
result.index = result.index.set_names(
- result_index.names + arg2.columns.names)
+ result_index.names + arg2.columns.names
+ )
return result
else:
raise ValueError("'pairwise' is not True/False")
else:
- results = {i: f(*_prep_binary(arg1.iloc[:, i], arg2))
- for i, col in enumerate(arg1.columns)}
+ results = {
+ i: f(*_prep_binary(arg1.iloc[:, i], arg2))
+ for i, col in enumerate(arg1.columns)
+ }
return dataframe_from_int_dict(results, arg1)
else:
@@ -2578,8 +2749,7 @@ def dataframe_from_int_dict(data, frame_template):
def _get_center_of_mass(comass, span, halflife, alpha):
valid_count = com.count_not_none(comass, span, halflife, alpha)
if valid_count > 1:
- raise ValueError("comass, span, halflife, and alpha "
- "are mutually exclusive")
+ raise ValueError("comass, span, halflife, and alpha " "are mutually exclusive")
# Convert to center of mass; domain checks ensure 0 < alpha <= 1
if comass is not None:
@@ -2588,7 +2758,7 @@ def _get_center_of_mass(comass, span, halflife, alpha):
elif span is not None:
if span < 1:
raise ValueError("span must satisfy: span >= 1")
- comass = (span - 1) / 2.
+ comass = (span - 1) / 2.0
elif halflife is not None:
if halflife <= 0:
raise ValueError("halflife must satisfy: halflife > 0")
@@ -2607,7 +2777,7 @@ def _get_center_of_mass(comass, span, halflife, alpha):
def _offset(window, center):
if not is_integer(window):
window = len(window)
- offset = (window - 1) / 2. if center else 0
+ offset = (window - 1) / 2.0 if center else 0
try:
return int(offset)
except TypeError:
@@ -2632,7 +2802,7 @@ def _use_window(minp, window):
def _zsqrt(x):
- with np.errstate(all='ignore'):
+ with np.errstate(all="ignore"):
result = np.sqrt(x)
mask = x < 0
@@ -2648,7 +2818,7 @@ def _zsqrt(x):
def _prep_binary(arg1, arg2):
if not isinstance(arg2, type(arg1)):
- raise Exception('Input arrays must be of the same type!')
+ raise Exception("Input arrays must be of the same type!")
# mask out values, this also makes a common index...
X = arg1 + 0 * arg2
@@ -2662,7 +2832,7 @@ def _prep_binary(arg1, arg2):
def rolling(obj, win_type=None, **kwds):
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
- raise TypeError('invalid type: %s' % type(obj))
+ raise TypeError("invalid type: %s" % type(obj))
if win_type is not None:
return Window(obj, win_type=win_type, **kwds)
@@ -2675,7 +2845,7 @@ def rolling(obj, win_type=None, **kwds):
def expanding(obj, **kwds):
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
- raise TypeError('invalid type: %s' % type(obj))
+ raise TypeError("invalid type: %s" % type(obj))
return Expanding(obj, **kwds)
@@ -2685,7 +2855,7 @@ def expanding(obj, **kwds):
def ewm(obj, **kwds):
if not isinstance(obj, (ABCSeries, ABCDataFrame)):
- raise TypeError('invalid type: %s' % type(obj))
+ raise TypeError("invalid type: %s" % type(obj))
return EWM(obj, **kwds)
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
index 3b8904f4c1ef6..3177937ac4ba1 100644
--- a/pandas/errors/__init__.py
+++ b/pandas/errors/__init__.py
@@ -174,17 +174,18 @@ class AbstractMethodError(NotImplementedError):
while keeping compatibility with Python 2 and Python 3.
"""
- def __init__(self, class_instance, methodtype='method'):
- types = {'method', 'classmethod', 'staticmethod', 'property'}
+ def __init__(self, class_instance, methodtype="method"):
+ types = {"method", "classmethod", "staticmethod", "property"}
if methodtype not in types:
- msg = 'methodtype must be one of {}, got {} instead.'.format(
- methodtype, types)
+ msg = "methodtype must be one of {}, got {} instead.".format(
+ methodtype, types
+ )
raise ValueError(msg)
self.methodtype = methodtype
self.class_instance = class_instance
def __str__(self):
- if self.methodtype == 'classmethod':
+ if self.methodtype == "classmethod":
name = self.class_instance.__name__
else:
name = self.class_instance.__class__.__name__
diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py
index e033d882a73f7..caa928731fb3a 100644
--- a/pandas/io/clipboard/__init__.py
+++ b/pandas/io/clipboard/__init__.py
@@ -23,15 +23,20 @@
This module does not work with PyGObject yet.
"""
-__version__ = '1.5.27'
+__version__ = "1.5.27"
import os
import platform
import subprocess
from .clipboards import (
- init_klipper_clipboard, init_no_clipboard, init_osx_clipboard,
- init_qt_clipboard, init_xclip_clipboard, init_xsel_clipboard)
+ init_klipper_clipboard,
+ init_no_clipboard,
+ init_osx_clipboard,
+ init_qt_clipboard,
+ init_xclip_clipboard,
+ init_xsel_clipboard,
+)
from .windows import init_windows_clipboard
# `import qtpy` sys.exit()s if DISPLAY is not in the environment.
@@ -42,20 +47,24 @@
def _executable_exists(name):
- return subprocess.call([CHECK_CMD, name],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
+ return (
+ subprocess.call(
+ [CHECK_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+ )
+ == 0
+ )
def determine_clipboard():
# Determine the OS/platform and set
# the copy() and paste() functions accordingly.
- if 'cygwin' in platform.system().lower():
+ if "cygwin" in platform.system().lower():
# FIXME: pyperclip currently does not support Cygwin,
# see https://github.com/asweigart/pyperclip/issues/55
pass
- elif os.name == 'nt' or platform.system() == 'Windows':
+ elif os.name == "nt" or platform.system() == "Windows":
return init_windows_clipboard()
- if os.name == 'mac' or platform.system() == 'Darwin':
+ if os.name == "mac" or platform.system() == "Darwin":
return init_osx_clipboard()
if HAS_DISPLAY:
# Determine which command/module is installed, if any.
@@ -94,13 +103,15 @@ def determine_clipboard():
def set_clipboard(clipboard):
global copy, paste
- clipboard_types = {'osx': init_osx_clipboard,
- 'qt': init_qt_clipboard,
- 'xclip': init_xclip_clipboard,
- 'xsel': init_xsel_clipboard,
- 'klipper': init_klipper_clipboard,
- 'windows': init_windows_clipboard,
- 'no': init_no_clipboard}
+ clipboard_types = {
+ "osx": init_osx_clipboard,
+ "qt": init_qt_clipboard,
+ "xclip": init_xclip_clipboard,
+ "xsel": init_xsel_clipboard,
+ "klipper": init_klipper_clipboard,
+ "windows": init_windows_clipboard,
+ "no": init_no_clipboard,
+ }
copy, paste = clipboard_types[clipboard]()
diff --git a/pandas/io/clipboard/clipboards.py b/pandas/io/clipboard/clipboards.py
index 52abdeafb5ecc..cb4ed8ed549d0 100644
--- a/pandas/io/clipboard/clipboards.py
+++ b/pandas/io/clipboard/clipboards.py
@@ -9,15 +9,13 @@
def init_osx_clipboard():
def copy_osx(text):
- p = subprocess.Popen(['pbcopy', 'w'],
- stdin=subprocess.PIPE, close_fds=True)
- p.communicate(input=text.encode('utf-8'))
+ p = subprocess.Popen(["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True)
+ p.communicate(input=text.encode("utf-8"))
def paste_osx():
- p = subprocess.Popen(['pbpaste', 'r'],
- stdout=subprocess.PIPE, close_fds=True)
+ p = subprocess.Popen(["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True)
stdout, stderr = p.communicate()
- return stdout.decode('utf-8')
+ return stdout.decode("utf-8")
return copy_osx, paste_osx
@@ -51,30 +49,34 @@ def paste_qt():
def init_xclip_clipboard():
def copy_xclip(text):
- p = subprocess.Popen(['xclip', '-selection', 'c'],
- stdin=subprocess.PIPE, close_fds=True)
- p.communicate(input=text.encode('utf-8'))
+ p = subprocess.Popen(
+ ["xclip", "-selection", "c"], stdin=subprocess.PIPE, close_fds=True
+ )
+ p.communicate(input=text.encode("utf-8"))
def paste_xclip():
- p = subprocess.Popen(['xclip', '-selection', 'c', '-o'],
- stdout=subprocess.PIPE, close_fds=True)
+ p = subprocess.Popen(
+ ["xclip", "-selection", "c", "-o"], stdout=subprocess.PIPE, close_fds=True
+ )
stdout, stderr = p.communicate()
- return stdout.decode('utf-8')
+ return stdout.decode("utf-8")
return copy_xclip, paste_xclip
def init_xsel_clipboard():
def copy_xsel(text):
- p = subprocess.Popen(['xsel', '-b', '-i'],
- stdin=subprocess.PIPE, close_fds=True)
- p.communicate(input=text.encode('utf-8'))
+ p = subprocess.Popen(
+ ["xsel", "-b", "-i"], stdin=subprocess.PIPE, close_fds=True
+ )
+ p.communicate(input=text.encode("utf-8"))
def paste_xsel():
- p = subprocess.Popen(['xsel', '-b', '-o'],
- stdout=subprocess.PIPE, close_fds=True)
+ p = subprocess.Popen(
+ ["xsel", "-b", "-o"], stdout=subprocess.PIPE, close_fds=True
+ )
stdout, stderr = p.communicate()
- return stdout.decode('utf-8')
+ return stdout.decode("utf-8")
return copy_xsel, paste_xsel
@@ -82,25 +84,34 @@ def paste_xsel():
def init_klipper_clipboard():
def copy_klipper(text):
p = subprocess.Popen(
- ['qdbus', 'org.kde.klipper', '/klipper', 'setClipboardContents',
- text.encode('utf-8')],
- stdin=subprocess.PIPE, close_fds=True)
+ [
+ "qdbus",
+ "org.kde.klipper",
+ "/klipper",
+ "setClipboardContents",
+ text.encode("utf-8"),
+ ],
+ stdin=subprocess.PIPE,
+ close_fds=True,
+ )
p.communicate(input=None)
def paste_klipper():
p = subprocess.Popen(
- ['qdbus', 'org.kde.klipper', '/klipper', 'getClipboardContents'],
- stdout=subprocess.PIPE, close_fds=True)
+ ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"],
+ stdout=subprocess.PIPE,
+ close_fds=True,
+ )
stdout, stderr = p.communicate()
# Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
# TODO: https://github.com/asweigart/pyperclip/issues/43
- clipboardContents = stdout.decode('utf-8')
+ clipboardContents = stdout.decode("utf-8")
# even if blank, Klipper will append a newline at the end
assert len(clipboardContents) > 0
# make sure that newline is there
- assert clipboardContents.endswith('\n')
- if clipboardContents.endswith('\n'):
+ assert clipboardContents.endswith("\n")
+ if clipboardContents.endswith("\n"):
clipboardContents = clipboardContents[:-1]
return clipboardContents
@@ -109,7 +120,6 @@ def paste_klipper():
def init_no_clipboard():
class ClipboardUnavailable:
-
def __call__(self, *args, **kwargs):
raise PyperclipException(EXCEPT_MSG)
diff --git a/pandas/io/clipboard/exceptions.py b/pandas/io/clipboard/exceptions.py
index 6276b06b9d7fe..eaf5578b5cd1b 100644
--- a/pandas/io/clipboard/exceptions.py
+++ b/pandas/io/clipboard/exceptions.py
@@ -6,7 +6,6 @@ class PyperclipException(RuntimeError):
class PyperclipWindowsException(PyperclipException):
-
def __init__(self, message):
message += " ({err})".format(err=ctypes.WinError())
super().__init__(message)
diff --git a/pandas/io/clipboard/windows.py b/pandas/io/clipboard/windows.py
index 72abc72966342..2935dfdc2ae19 100644
--- a/pandas/io/clipboard/windows.py
+++ b/pandas/io/clipboard/windows.py
@@ -10,7 +10,6 @@
class CheckedCall:
-
def __init__(self, f):
super().__setattr__("f", f)
@@ -25,15 +24,38 @@ def __setattr__(self, key, value):
def init_windows_clipboard():
- from ctypes.wintypes import (HGLOBAL, LPVOID, DWORD, LPCSTR, INT, HWND,
- HINSTANCE, HMENU, BOOL, UINT, HANDLE)
+ from ctypes.wintypes import (
+ HGLOBAL,
+ LPVOID,
+ DWORD,
+ LPCSTR,
+ INT,
+ HWND,
+ HINSTANCE,
+ HMENU,
+ BOOL,
+ UINT,
+ HANDLE,
+ )
windll = ctypes.windll
- msvcrt = ctypes.CDLL('msvcrt')
+ msvcrt = ctypes.CDLL("msvcrt")
safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
- safeCreateWindowExA.argtypes = [DWORD, LPCSTR, LPCSTR, DWORD, INT, INT,
- INT, INT, HWND, HMENU, HINSTANCE, LPVOID]
+ safeCreateWindowExA.argtypes = [
+ DWORD,
+ LPCSTR,
+ LPCSTR,
+ DWORD,
+ INT,
+ INT,
+ INT,
+ INT,
+ HWND,
+ HMENU,
+ HINSTANCE,
+ LPVOID,
+ ]
safeCreateWindowExA.restype = HWND
safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
@@ -86,8 +108,9 @@ def window():
"""
# we really just need the hwnd, so setting "STATIC"
# as predefined lpClass is just fine.
- hwnd = safeCreateWindowExA(0, b"STATIC", None, 0, 0, 0, 0, 0,
- None, None, None, None)
+ hwnd = safeCreateWindowExA(
+ 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
+ )
try:
yield hwnd
finally:
@@ -135,12 +158,14 @@ def copy_windows(text):
# the object must have been allocated using the
# function with the GMEM_MOVEABLE flag.
count = wcslen(text) + 1
- handle = safeGlobalAlloc(GMEM_MOVEABLE,
- count * sizeof(c_wchar))
+ handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
locked_handle = safeGlobalLock(handle)
- ctypes.memmove(c_wchar_p(locked_handle), c_wchar_p(text),
- count * sizeof(c_wchar))
+ ctypes.memmove(
+ c_wchar_p(locked_handle),
+ c_wchar_p(text),
+ count * sizeof(c_wchar),
+ )
safeGlobalUnlock(handle)
safeSetClipboardData(CF_UNICODETEXT, handle)
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
index dc30285895dd5..0006824f09fe7 100644
--- a/pandas/io/clipboards.py
+++ b/pandas/io/clipboards.py
@@ -7,7 +7,7 @@
from pandas import get_option, option_context
-def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
+def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover
r"""
Read text from clipboard and pass to read_csv. See read_csv for the
full argument list
@@ -22,22 +22,21 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
-------
parsed : DataFrame
"""
- encoding = kwargs.pop('encoding', 'utf-8')
+ encoding = kwargs.pop("encoding", "utf-8")
# only utf-8 is valid for passed value because that's what clipboard
# supports
- if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
- raise NotImplementedError(
- 'reading from clipboard only supports utf-8 encoding')
+ if encoding is not None and encoding.lower().replace("-", "") != "utf8":
+ raise NotImplementedError("reading from clipboard only supports utf-8 encoding")
from pandas.io.clipboard import clipboard_get
from pandas.io.parsers import read_csv
+
text = clipboard_get()
# Try to decode (if needed, as "text" might already be a string here).
try:
- text = text.decode(kwargs.get('encoding')
- or get_option('display.encoding'))
+ text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
except AttributeError:
pass
@@ -45,7 +44,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
# inspect no more then the 10 first lines, if they
# all contain an equal number (>0) of tabs, infer
# that this came from excel and set 'sep' accordingly
- lines = text[:10000].split('\n')[:-1][:10]
+ lines = text[:10000].split("\n")[:-1][:10]
# Need to remove leading white space, since read_csv
# accepts:
@@ -53,21 +52,23 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover
# 0 1 2
# 1 3 4
- counts = {x.lstrip().count('\t') for x in lines}
+ counts = {x.lstrip().count("\t") for x in lines}
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
- sep = '\t'
+ sep = "\t"
# Edge case where sep is specified to be None, return to default
- if sep is None and kwargs.get('delim_whitespace') is None:
- sep = r'\s+'
+ if sep is None and kwargs.get("delim_whitespace") is None:
+ sep = r"\s+"
# Regex separator currently only works with python engine.
# Default to python if separator is multi-character (regex)
- if len(sep) > 1 and kwargs.get('engine') is None:
- kwargs['engine'] = 'python'
- elif len(sep) > 1 and kwargs.get('engine') == 'c':
- warnings.warn('read_clipboard with regex separator does not work'
- ' properly with c engine')
+ if len(sep) > 1 and kwargs.get("engine") is None:
+ kwargs["engine"] = "python"
+ elif len(sep) > 1 and kwargs.get("engine") == "c":
+ warnings.warn(
+ "read_clipboard with regex separator does not work"
+ " properly with c engine"
+ )
return read_csv(StringIO(text), sep=sep, **kwargs)
@@ -95,37 +96,39 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover
- Windows:
- OS X:
"""
- encoding = kwargs.pop('encoding', 'utf-8')
+ encoding = kwargs.pop("encoding", "utf-8")
# testing if an invalid encoding is passed to clipboard
- if encoding is not None and encoding.lower().replace('-', '') != 'utf8':
- raise ValueError('clipboard only supports utf-8 encoding')
+ if encoding is not None and encoding.lower().replace("-", "") != "utf8":
+ raise ValueError("clipboard only supports utf-8 encoding")
from pandas.io.clipboard import clipboard_set
+
if excel is None:
excel = True
if excel:
try:
if sep is None:
- sep = '\t'
+ sep = "\t"
buf = StringIO()
# clipboard_set (pyperclip) expects unicode
- obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)
+ obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
text = buf.getvalue()
clipboard_set(text)
return
except TypeError:
- warnings.warn('to_clipboard in excel mode requires a single '
- 'character separator.')
+ warnings.warn(
+ "to_clipboard in excel mode requires a single " "character separator."
+ )
elif sep is not None:
- warnings.warn('to_clipboard with excel=False ignores the sep argument')
+ warnings.warn("to_clipboard with excel=False ignores the sep argument")
if isinstance(obj, ABCDataFrame):
# str(df) has various unhelpful defaults, like truncation
- with option_context('display.max_colwidth', 999999):
+ with option_context("display.max_colwidth", 999999):
objstr = obj.to_string(**kwargs)
else:
objstr = str(obj)
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 34635ebf64ad6..9a9620e2d0663 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -12,14 +12,23 @@
import pathlib
from urllib.error import URLError # noqa
from urllib.parse import ( # noqa
- urlencode, urljoin, urlparse as parse_url, uses_netloc, uses_params,
- uses_relative)
+ urlencode,
+ urljoin,
+ urlparse as parse_url,
+ uses_netloc,
+ uses_params,
+ uses_relative,
+)
from urllib.request import pathname2url, urlopen
import zipfile
from pandas.errors import ( # noqa
- AbstractMethodError, DtypeWarning, EmptyDataError, ParserError,
- ParserWarning)
+ AbstractMethodError,
+ DtypeWarning,
+ EmptyDataError,
+ ParserError,
+ ParserWarning,
+)
from pandas.core.dtypes.common import is_file_like
@@ -29,13 +38,29 @@
# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
-_NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
- 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan',
- '-nan', ''}
+_NA_VALUES = {
+ "-1.#IND",
+ "1.#QNAN",
+ "1.#IND",
+ "-1.#QNAN",
+ "#N/A N/A",
+ "#N/A",
+ "N/A",
+ "n/a",
+ "NA",
+ "#NA",
+ "NULL",
+ "null",
+ "NaN",
+ "-NaN",
+ "nan",
+ "-nan",
+ "",
+}
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
-_VALID_URLS.discard('')
+_VALID_URLS.discard("")
class BaseIterator:
@@ -88,10 +113,12 @@ def _expand_user(filepath_or_buffer):
def _validate_header_arg(header):
if isinstance(header, bool):
- raise TypeError("Passing a bool to header is invalid. "
- "Use header=None for no header or "
- "header=int or list-like of ints to specify "
- "the row(s) making up the column names")
+ raise TypeError(
+ "Passing a bool to header is invalid. "
+ "Use header=None for no header or "
+ "header=int or list-like of ints to specify "
+ "the row(s) making up the column names"
+ )
def _stringify_path(filepath_or_buffer):
@@ -116,7 +143,7 @@ def _stringify_path(filepath_or_buffer):
Any other object is passed through unchanged, which includes bytes,
strings, buffers, or anything else that's not even path-like.
"""
- if hasattr(filepath_or_buffer, '__fspath__'):
+ if hasattr(filepath_or_buffer, "__fspath__"):
return filepath_or_buffer.__fspath__()
elif isinstance(filepath_or_buffer, pathlib.Path):
return str(filepath_or_buffer)
@@ -126,7 +153,7 @@ def _stringify_path(filepath_or_buffer):
def is_s3_url(url):
"""Check for an s3, s3n, or s3a url"""
try:
- return parse_url(url).scheme in ['s3', 's3n', 's3a']
+ return parse_url(url).scheme in ["s3", "s3n", "s3a"]
except Exception:
return False
@@ -134,13 +161,14 @@ def is_s3_url(url):
def is_gcs_url(url):
"""Check for a gcs url"""
try:
- return parse_url(url).scheme in ['gcs', 'gs']
+ return parse_url(url).scheme in ["gcs", "gs"]
except Exception:
return False
-def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
- compression=None, mode=None):
+def get_filepath_or_buffer(
+ filepath_or_buffer, encoding=None, compression=None, mode=None
+):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Otherwise passthrough.
@@ -164,27 +192,27 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
if _is_url(filepath_or_buffer):
req = urlopen(filepath_or_buffer)
- content_encoding = req.headers.get('Content-Encoding', None)
- if content_encoding == 'gzip':
+ content_encoding = req.headers.get("Content-Encoding", None)
+ if content_encoding == "gzip":
# Override compression based on Content-Encoding header
- compression = 'gzip'
+ compression = "gzip"
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
if is_s3_url(filepath_or_buffer):
from pandas.io import s3
- return s3.get_filepath_or_buffer(filepath_or_buffer,
- encoding=encoding,
- compression=compression,
- mode=mode)
+
+ return s3.get_filepath_or_buffer(
+ filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
+ )
if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs
- return gcs.get_filepath_or_buffer(filepath_or_buffer,
- encoding=encoding,
- compression=compression,
- mode=mode)
+
+ return gcs.get_filepath_or_buffer(
+ filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
+ )
if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression, False
@@ -208,15 +236,10 @@ def file_path_to_url(path):
-------
a valid FILE URL
"""
- return urljoin('file:', pathname2url(path))
+ return urljoin("file:", pathname2url(path))
-_compression_to_extension = {
- 'gzip': '.gz',
- 'bz2': '.bz2',
- 'zip': '.zip',
- 'xz': '.xz',
-}
+_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"}
def _infer_compression(filepath_or_buffer, compression):
@@ -250,7 +273,7 @@ def _infer_compression(filepath_or_buffer, compression):
return None
# Infer compression
- if compression == 'infer':
+ if compression == "infer":
# Convert all path types (e.g. pathlib.Path) to strings
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
@@ -267,14 +290,15 @@ def _infer_compression(filepath_or_buffer, compression):
if compression in _compression_to_extension:
return compression
- msg = 'Unrecognized compression type: {}'.format(compression)
- valid = ['infer', None] + sorted(_compression_to_extension)
- msg += '\nValid compression types are {}'.format(valid)
+ msg = "Unrecognized compression type: {}".format(compression)
+ valid = ["infer", None] + sorted(_compression_to_extension)
+ msg += "\nValid compression types are {}".format(valid)
raise ValueError(msg)
-def _get_handle(path_or_buf, mode, encoding=None, compression=None,
- memory_map=False, is_text=True):
+def _get_handle(
+ path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True
+):
"""
Get file handle for given path/buffer and mode.
@@ -304,6 +328,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
"""
try:
from s3fs import S3File
+
need_text_wrapping = (BytesIO, S3File)
except ImportError:
need_text_wrapping = (BytesIO,)
@@ -321,45 +346,47 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
if compression:
# GZ Compression
- if compression == 'gzip':
+ if compression == "gzip":
if is_path:
f = gzip.open(path_or_buf, mode)
else:
f = gzip.GzipFile(fileobj=path_or_buf)
# BZ Compression
- elif compression == 'bz2':
+ elif compression == "bz2":
if is_path:
f = bz2.BZ2File(path_or_buf, mode)
else:
f = bz2.BZ2File(path_or_buf)
# ZIP Compression
- elif compression == 'zip':
+ elif compression == "zip":
zf = BytesZipFile(path_or_buf, mode)
# Ensure the container is closed as well.
handles.append(zf)
- if zf.mode == 'w':
+ if zf.mode == "w":
f = zf
- elif zf.mode == 'r':
+ elif zf.mode == "r":
zip_names = zf.namelist()
if len(zip_names) == 1:
f = zf.open(zip_names.pop())
elif len(zip_names) == 0:
- raise ValueError('Zero files found in ZIP file {}'
- .format(path_or_buf))
+ raise ValueError(
+ "Zero files found in ZIP file {}".format(path_or_buf)
+ )
else:
- raise ValueError('Multiple files found in ZIP file.'
- ' Only one file per ZIP: {}'
- .format(zip_names))
+ raise ValueError(
+ "Multiple files found in ZIP file."
+ " Only one file per ZIP: {}".format(zip_names)
+ )
# XZ Compression
- elif compression == 'xz':
+ elif compression == "xz":
f = lzma.LZMAFile(path_or_buf, mode)
# Unrecognized Compression
else:
- msg = 'Unrecognized compression type: {}'.format(compression)
+ msg = "Unrecognized compression type: {}".format(compression)
raise ValueError(msg)
handles.append(f)
@@ -370,7 +397,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
f = open(path_or_buf, mode, encoding=encoding, newline="")
elif is_text:
# No explicit encoding
- f = open(path_or_buf, mode, errors='replace', newline="")
+ f = open(path_or_buf, mode, errors="replace", newline="")
else:
# Binary mode
f = open(path_or_buf, mode)
@@ -379,10 +406,11 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
# Convert BytesIO or file objects passed with an encoding
if is_text and (compression or isinstance(f, need_text_wrapping)):
from io import TextIOWrapper
- f = TextIOWrapper(f, encoding=encoding, newline='')
+
+ f = TextIOWrapper(f, encoding=encoding, newline="")
handles.append(f)
- if memory_map and hasattr(f, 'fileno'):
+ if memory_map and hasattr(f, "fileno"):
try:
g = MMapWrapper(f)
f.close()
@@ -405,10 +433,11 @@ class BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore
BytesIO provides attributes of file-like object and ZipFile.writestr writes
bytes strings into a member of the archive.
"""
+
# GH 17778
def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
- if mode in ['wb', 'rb']:
- mode = mode.replace('b', '')
+ if mode in ["wb", "rb"]:
+ mode = mode.replace("b", "")
super().__init__(file, mode, compression, **kwargs)
def write(self, data):
@@ -446,12 +475,12 @@ def __next__(self):
# readline returns bytes, not str, but Python's CSV reader
# expects str, so convert the output to str before continuing
- newline = newline.decode('utf-8')
+ newline = newline.decode("utf-8")
# mmap doesn't raise if reading past the allocated
# data but instead returns an empty string, so raise
# if that is returned
- if newline == '':
+ if newline == "":
raise StopIteration
return newline
diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py
index 75f353f28549c..ab64bc14344f1 100644
--- a/pandas/io/date_converters.py
+++ b/pandas/io/date_converters.py
@@ -17,17 +17,16 @@ def parse_date_fields(year_col, month_col, day_col):
return parsing.try_parse_year_month_day(year_col, month_col, day_col)
-def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col,
- second_col):
+def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col):
year_col = _maybe_cast(year_col)
month_col = _maybe_cast(month_col)
day_col = _maybe_cast(day_col)
hour_col = _maybe_cast(hour_col)
minute_col = _maybe_cast(minute_col)
second_col = _maybe_cast(second_col)
- return parsing.try_parse_datetime_components(year_col, month_col, day_col,
- hour_col, minute_col,
- second_col)
+ return parsing.try_parse_datetime_components(
+ year_col, month_col, day_col, hour_col, minute_col, second_col
+ )
def generic_parser(parse_func, *cols):
@@ -57,7 +56,9 @@ def _check_columns(cols):
for i, n in enumerate(map(len, tail)):
if n != N:
- raise AssertionError('All columns must have the same length: {0}; '
- 'column {1} has length {2}'.format(N, i, n))
+ raise AssertionError(
+ "All columns must have the same length: {0}; "
+ "column {1} has length {2}".format(N, i, n)
+ )
return N
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index d10a40541bb6c..fae8f4203e9a0 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -11,21 +11,29 @@
from pandas.errors import EmptyDataError
from pandas.util._decorators import Appender, deprecate_kwarg
-from pandas.core.dtypes.common import (
- is_bool, is_float, is_integer, is_list_like)
+from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like
from pandas.core.frame import DataFrame
from pandas.io.common import (
- _NA_VALUES, _is_url, _stringify_path, _validate_header_arg,
- get_filepath_or_buffer)
+ _NA_VALUES,
+ _is_url,
+ _stringify_path,
+ _validate_header_arg,
+ get_filepath_or_buffer,
+)
from pandas.io.excel._util import (
- _fill_mi_header, _get_default_writer, _maybe_convert_usecols,
- _pop_header_name, get_writer)
+ _fill_mi_header,
+ _get_default_writer,
+ _maybe_convert_usecols,
+ _pop_header_name,
+ get_writer,
+)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import TextParser
-_read_excel_doc = """
+_read_excel_doc = (
+ """
Read an Excel file into a pandas DataFrame.
Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
@@ -124,8 +132,9 @@
na_values : scalar, str, list-like, or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted
- as NaN: '""" + fill("', '".join(
- sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'.
+ as NaN: '"""
+ + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ")
+ + """'.
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN
values are overridden, otherwise they're appended to.
@@ -251,47 +260,53 @@
1 string2 2.0
2 None NaN
"""
+)
@Appender(_read_excel_doc)
@deprecate_kwarg("skip_footer", "skipfooter")
-def read_excel(io,
- sheet_name=0,
- header=0,
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- dtype=None,
- engine=None,
- converters=None,
- true_values=None,
- false_values=None,
- skiprows=None,
- nrows=None,
- na_values=None,
- keep_default_na=True,
- verbose=False,
- parse_dates=False,
- date_parser=None,
- thousands=None,
- comment=None,
- skip_footer=0,
- skipfooter=0,
- convert_float=True,
- mangle_dupe_cols=True,
- **kwds):
-
- for arg in ('sheet', 'sheetname', 'parse_cols'):
+def read_excel(
+ io,
+ sheet_name=0,
+ header=0,
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ keep_default_na=True,
+ verbose=False,
+ parse_dates=False,
+ date_parser=None,
+ thousands=None,
+ comment=None,
+ skip_footer=0,
+ skipfooter=0,
+ convert_float=True,
+ mangle_dupe_cols=True,
+ **kwds
+):
+
+ for arg in ("sheet", "sheetname", "parse_cols"):
if arg in kwds:
- raise TypeError("read_excel() got an unexpected keyword argument "
- "`{}`".format(arg))
+ raise TypeError(
+ "read_excel() got an unexpected keyword argument " "`{}`".format(arg)
+ )
if not isinstance(io, ExcelFile):
io = ExcelFile(io, engine=engine)
elif engine and engine != io.engine:
- raise ValueError("Engine should not be specified when passing "
- "an ExcelFile - ExcelFile already has the engine set")
+ raise ValueError(
+ "Engine should not be specified when passing "
+ "an ExcelFile - ExcelFile already has the engine set"
+ )
return io.parse(
sheet_name=sheet_name,
@@ -316,19 +331,17 @@ def read_excel(io,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
- **kwds)
+ **kwds
+ )
class _BaseExcelReader(metaclass=abc.ABCMeta):
-
def __init__(self, filepath_or_buffer):
# If filepath_or_buffer is a url, load the data into a BytesIO
if _is_url(filepath_or_buffer):
filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
- elif not isinstance(filepath_or_buffer,
- (ExcelFile, self._workbook_class)):
- filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
- filepath_or_buffer)
+ elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
+ filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)
if isinstance(filepath_or_buffer, self._workbook_class):
self.book = filepath_or_buffer
@@ -339,8 +352,9 @@ def __init__(self, filepath_or_buffer):
elif isinstance(filepath_or_buffer, str):
self.book = self.load_workbook(filepath_or_buffer)
else:
- raise ValueError('Must explicitly set engine if not passing in'
- ' buffer or path for io.')
+ raise ValueError(
+ "Must explicitly set engine if not passing in" " buffer or path for io."
+ )
@property
@abc.abstractmethod
@@ -368,28 +382,30 @@ def get_sheet_by_index(self, index):
def get_sheet_data(self, sheet, convert_float):
pass
- def parse(self,
- sheet_name=0,
- header=0,
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- dtype=None,
- true_values=None,
- false_values=None,
- skiprows=None,
- nrows=None,
- na_values=None,
- verbose=False,
- parse_dates=False,
- date_parser=None,
- thousands=None,
- comment=None,
- skipfooter=0,
- convert_float=True,
- mangle_dupe_cols=True,
- **kwds):
+ def parse(
+ self,
+ sheet_name=0,
+ header=0,
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ dtype=None,
+ true_values=None,
+ false_values=None,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ verbose=False,
+ parse_dates=False,
+ date_parser=None,
+ thousands=None,
+ comment=None,
+ skipfooter=0,
+ convert_float=True,
+ mangle_dupe_cols=True,
+ **kwds
+ ):
_validate_header_arg(header)
@@ -439,8 +455,7 @@ def parse(self,
if is_integer(skiprows):
row += skiprows
- data[row], control_row = _fill_mi_header(data[row],
- control_row)
+ data[row], control_row = _fill_mi_header(data[row], control_row)
if index_col is not None:
header_name, _ = _pop_header_name(data[row], index_col)
@@ -460,7 +475,7 @@ def parse(self,
last = data[offset][col]
for row in range(offset + 1, len(data)):
- if data[row][col] == '' or data[row][col] is None:
+ if data[row][col] == "" or data[row][col] is None:
data[row][col] = last
else:
last = data[row][col]
@@ -469,33 +484,36 @@ def parse(self,
# GH 12292 : error when read one empty column from excel file
try:
- parser = TextParser(data,
- names=names,
- header=header,
- index_col=index_col,
- has_index_names=has_index_names,
- squeeze=squeeze,
- dtype=dtype,
- true_values=true_values,
- false_values=false_values,
- skiprows=skiprows,
- nrows=nrows,
- na_values=na_values,
- parse_dates=parse_dates,
- date_parser=date_parser,
- thousands=thousands,
- comment=comment,
- skipfooter=skipfooter,
- usecols=usecols,
- mangle_dupe_cols=mangle_dupe_cols,
- **kwds)
+ parser = TextParser(
+ data,
+ names=names,
+ header=header,
+ index_col=index_col,
+ has_index_names=has_index_names,
+ squeeze=squeeze,
+ dtype=dtype,
+ true_values=true_values,
+ false_values=false_values,
+ skiprows=skiprows,
+ nrows=nrows,
+ na_values=na_values,
+ parse_dates=parse_dates,
+ date_parser=date_parser,
+ thousands=thousands,
+ comment=comment,
+ skipfooter=skipfooter,
+ usecols=usecols,
+ mangle_dupe_cols=mangle_dupe_cols,
+ **kwds
+ )
output[asheetname] = parser.read(nrows=nrows)
if not squeeze or isinstance(output[asheetname], DataFrame):
if header_names:
output[asheetname].columns = output[
- asheetname].columns.set_names(header_names)
+ asheetname
+ ].columns.set_names(header_names)
except EmptyDataError:
# No Data, return an empty DataFrame
@@ -570,6 +588,7 @@ class ExcelWriter(metaclass=abc.ABCMeta):
>>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer:
... df.to_excel(writer, sheet_name='Sheet3')
"""
+
# Defining an ExcelWriter implementation (see abstract methods for more...)
# - Mandatory
@@ -595,21 +614,18 @@ def __new__(cls, path, engine=None, **kwargs):
# only switch class if generic(ExcelWriter)
if cls is ExcelWriter:
- if engine is None or (isinstance(engine, str) and
- engine == 'auto'):
+ if engine is None or (isinstance(engine, str) and engine == "auto"):
if isinstance(path, str):
ext = os.path.splitext(path)[-1][1:]
else:
- ext = 'xlsx'
+ ext = "xlsx"
try:
- engine = config.get_option('io.excel.{ext}.writer'
- .format(ext=ext))
- if engine == 'auto':
+ engine = config.get_option("io.excel.{ext}.writer".format(ext=ext))
+ if engine == "auto":
engine = _get_default_writer(ext)
except KeyError:
- raise ValueError("No engine for filetype: '{ext}'"
- .format(ext=ext))
+ raise ValueError("No engine for filetype: '{ext}'".format(ext=ext))
cls = get_writer(engine)
return object.__new__(cls)
@@ -632,8 +648,9 @@ def engine(self):
pass
@abc.abstractmethod
- def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
- freeze_panes=None):
+ def write_cells(
+ self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
+ ):
"""
Write given formatted cells into Excel an excel sheet
@@ -657,14 +674,20 @@ def save(self):
"""
pass
- def __init__(self, path, engine=None,
- date_format=None, datetime_format=None, mode='w',
- **engine_kwargs):
+ def __init__(
+ self,
+ path,
+ engine=None,
+ date_format=None,
+ datetime_format=None,
+ mode="w",
+ **engine_kwargs
+ ):
# validate that this engine can handle the extension
if isinstance(path, str):
ext = os.path.splitext(path)[-1]
else:
- ext = 'xls' if engine == 'xlwt' else 'xlsx'
+ ext = "xls" if engine == "xlwt" else "xlsx"
self.check_extension(ext)
@@ -673,11 +696,11 @@ def __init__(self, path, engine=None,
self.cur_sheet = None
if date_format is None:
- self.date_format = 'YYYY-MM-DD'
+ self.date_format = "YYYY-MM-DD"
else:
self.date_format = date_format
if datetime_format is None:
- self.datetime_format = 'YYYY-MM-DD HH:MM:SS'
+ self.datetime_format = "YYYY-MM-DD HH:MM:SS"
else:
self.datetime_format = datetime_format
@@ -690,8 +713,9 @@ def _get_sheet_name(self, sheet_name):
if sheet_name is None:
sheet_name = self.cur_sheet
if sheet_name is None: # pragma: no cover
- raise ValueError('Must pass explicit sheet_name or set '
- 'cur_sheet property')
+ raise ValueError(
+ "Must pass explicit sheet_name or set " "cur_sheet property"
+ )
return sheet_name
def _value_with_fmt(self, val):
@@ -721,7 +745,7 @@ def _value_with_fmt(self, val):
fmt = self.date_format
elif isinstance(val, timedelta):
val = val.total_seconds() / float(86400)
- fmt = '0'
+ fmt = "0"
else:
val = str(val)
@@ -731,12 +755,12 @@ def _value_with_fmt(self, val):
def check_extension(cls, ext):
"""checks that path's extension against the Writer's supported
extensions. If it isn't supported, raises UnsupportedFiletypeError."""
- if ext.startswith('.'):
+ if ext.startswith("."):
ext = ext[1:]
if not any(ext in extension for extension in cls.supported_extensions):
- msg = ("Invalid extension for engine '{engine}': '{ext}'"
- .format(engine=pprint_thing(cls.engine),
- ext=pprint_thing(ext)))
+ msg = "Invalid extension for engine '{engine}': '{ext}'".format(
+ engine=pprint_thing(cls.engine), ext=pprint_thing(ext)
+ )
raise ValueError(msg)
else:
return True
@@ -772,15 +796,11 @@ class ExcelFile:
from pandas.io.excel._openpyxl import _OpenpyxlReader
from pandas.io.excel._xlrd import _XlrdReader
- _engines = {
- 'xlrd': _XlrdReader,
- 'openpyxl': _OpenpyxlReader,
- 'odf': _ODFReader,
- }
+ _engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader}
def __init__(self, io, engine=None):
if engine is None:
- engine = 'xlrd'
+ engine = "xlrd"
if engine not in self._engines:
raise ValueError("Unknown engine: {engine}".format(engine=engine))
@@ -795,27 +815,29 @@ def __init__(self, io, engine=None):
def __fspath__(self):
return self._io
- def parse(self,
- sheet_name=0,
- header=0,
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- converters=None,
- true_values=None,
- false_values=None,
- skiprows=None,
- nrows=None,
- na_values=None,
- parse_dates=False,
- date_parser=None,
- thousands=None,
- comment=None,
- skipfooter=0,
- convert_float=True,
- mangle_dupe_cols=True,
- **kwds):
+ def parse(
+ self,
+ sheet_name=0,
+ header=0,
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ parse_dates=False,
+ date_parser=None,
+ thousands=None,
+ comment=None,
+ skipfooter=0,
+ convert_float=True,
+ mangle_dupe_cols=True,
+ **kwds
+ ):
"""
Parse specified sheet(s) into a DataFrame
@@ -827,30 +849,33 @@ def parse(self,
DataFrame or dict of DataFrames
DataFrame from the passed in Excel file.
"""
- if 'chunksize' in kwds:
- raise NotImplementedError("chunksize keyword of read_excel "
- "is not implemented")
-
- return self._reader.parse(sheet_name=sheet_name,
- header=header,
- names=names,
- index_col=index_col,
- usecols=usecols,
- squeeze=squeeze,
- converters=converters,
- true_values=true_values,
- false_values=false_values,
- skiprows=skiprows,
- nrows=nrows,
- na_values=na_values,
- parse_dates=parse_dates,
- date_parser=date_parser,
- thousands=thousands,
- comment=comment,
- skipfooter=skipfooter,
- convert_float=convert_float,
- mangle_dupe_cols=mangle_dupe_cols,
- **kwds)
+ if "chunksize" in kwds:
+ raise NotImplementedError(
+ "chunksize keyword of read_excel " "is not implemented"
+ )
+
+ return self._reader.parse(
+ sheet_name=sheet_name,
+ header=header,
+ names=names,
+ index_col=index_col,
+ usecols=usecols,
+ squeeze=squeeze,
+ converters=converters,
+ true_values=true_values,
+ false_values=false_values,
+ skiprows=skiprows,
+ nrows=nrows,
+ na_values=na_values,
+ parse_dates=parse_dates,
+ date_parser=date_parser,
+ thousands=thousands,
+ comment=comment,
+ skipfooter=skipfooter,
+ convert_float=convert_float,
+ mangle_dupe_cols=mangle_dupe_cols,
+ **kwds
+ )
@property
def book(self):
@@ -862,7 +887,7 @@ def sheet_names(self):
def close(self):
"""close io if necessary"""
- if hasattr(self.io, 'close'):
+ if hasattr(self.io, "close"):
self.io.close()
def __enter__(self):
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index c820c1497c3c9..3be36663bac79 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -16,6 +16,7 @@ class _ODFReader(_BaseExcelReader):
filepath_or_buffer: string, path to be parsed or
an open readable stream.
"""
+
def __init__(self, filepath_or_buffer: FilePathOrBuffer):
import_optional_dependency("odf")
super().__init__(filepath_or_buffer)
@@ -23,16 +24,18 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer):
@property
def _workbook_class(self):
from odf.opendocument import OpenDocument
+
return OpenDocument
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
from odf.opendocument import load
+
return load(filepath_or_buffer)
@property
def empty_value(self) -> str:
"""Property for compat with other readers."""
- return ''
+ return ""
@property
def sheet_names(self) -> List[str]:
@@ -44,6 +47,7 @@ def sheet_names(self) -> List[str]:
def get_sheet_by_index(self, index: int):
from odf.table import Table
+
tables = self.book.getElementsByType(Table)
return tables[index]
@@ -74,8 +78,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
table = [] # type: List[List[Scalar]]
for i, sheet_row in enumerate(sheet_rows):
- sheet_cells = [x for x in sheet_row.childNodes
- if x.qname in cell_names]
+ sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names]
empty_cells = 0
table_row = [] # type: List[Scalar]
@@ -122,12 +125,12 @@ def _get_row_repeat(self, row) -> int:
"""
from odf.namespaces import TABLENS
- return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1))
+ return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
def _get_column_repeat(self, cell) -> int:
from odf.namespaces import TABLENS
- return int(cell.attributes.get(
- (TABLENS, 'number-columns-repeated'), 1))
+
+ return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
def _is_empty_row(self, row) -> bool:
"""Helper function to find empty rows
@@ -140,18 +143,19 @@ def _is_empty_row(self, row) -> bool:
def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
from odf.namespaces import OFFICENS
- cell_type = cell.attributes.get((OFFICENS, 'value-type'))
- if cell_type == 'boolean':
+
+ cell_type = cell.attributes.get((OFFICENS, "value-type"))
+ if cell_type == "boolean":
if str(cell) == "TRUE":
return True
return False
if cell_type is None:
return self.empty_value
- elif cell_type == 'float':
+ elif cell_type == "float":
# GH5394
- cell_value = float(cell.attributes.get((OFFICENS, 'value')))
+ cell_value = float(cell.attributes.get((OFFICENS, "value")))
- if cell_value == 0. and str(cell) != cell_value: # NA handling
+ if cell_value == 0.0 and str(cell) != cell_value: # NA handling
return str(cell)
if convert_float:
@@ -159,18 +163,18 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
if val == cell_value:
return val
return cell_value
- elif cell_type == 'percentage':
- cell_value = cell.attributes.get((OFFICENS, 'value'))
+ elif cell_type == "percentage":
+ cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
- elif cell_type == 'string':
+ elif cell_type == "string":
return str(cell)
- elif cell_type == 'currency':
- cell_value = cell.attributes.get((OFFICENS, 'value'))
+ elif cell_type == "currency":
+ cell_value = cell.attributes.get((OFFICENS, "value"))
return float(cell_value)
- elif cell_type == 'date':
- cell_value = cell.attributes.get((OFFICENS, 'date-value'))
+ elif cell_type == "date":
+ cell_value = cell.attributes.get((OFFICENS, "date-value"))
return pd.to_datetime(cell_value)
- elif cell_type == 'time':
+ elif cell_type == "time":
return pd.to_datetime(str(cell)).time()
else:
- raise ValueError('Unrecognized type {}'.format(cell_type))
+ raise ValueError("Unrecognized type {}".format(cell_type))
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index 7b1e203bd33ad..d8f5da5ab5bc6 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -11,17 +11,18 @@
class _OpenpyxlWriter(ExcelWriter):
- engine = 'openpyxl'
- supported_extensions = ('.xlsx', '.xlsm')
+ engine = "openpyxl"
+ supported_extensions = (".xlsx", ".xlsm")
- def __init__(self, path, engine=None, mode='w', **engine_kwargs):
+ def __init__(self, path, engine=None, mode="w", **engine_kwargs):
# Use the openpyxl module as the Excel writer.
from openpyxl.workbook import Workbook
super().__init__(path, mode=mode, **engine_kwargs)
- if self.mode == 'a': # Load from existing workbook
+ if self.mode == "a": # Load from existing workbook
from openpyxl import load_workbook
+
book = load_workbook(self.path)
self.book = book
else:
@@ -52,12 +53,16 @@ def _convert_to_style(cls, style_dict):
"""
from openpyxl.style import Style
+
xls_style = Style()
for key, value in style_dict.items():
for nk, nv in value.items():
if key == "borders":
- (xls_style.borders.__getattribute__(nk)
- .__setattr__('border_style', nv))
+ (
+ xls_style.borders.__getattribute__(nk).__setattr__(
+ "border_style", nv
+ )
+ )
else:
xls_style.__getattribute__(key).__setattr__(nk, nv)
@@ -86,16 +91,13 @@ def _convert_to_style_kwargs(cls, style_dict):
appropriate class.
"""
- _style_key_map = {
- 'borders': 'border',
- }
+ _style_key_map = {"borders": "border"}
style_kwargs = {}
for k, v in style_dict.items():
if k in _style_key_map:
k = _style_key_map[k]
- _conv_to_x = getattr(cls, '_convert_to_{k}'.format(k=k),
- lambda x: None)
+ _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None)
new_v = _conv_to_x(v)
if new_v:
style_kwargs[k] = new_v
@@ -160,19 +162,19 @@ def _convert_to_font(cls, font_dict):
from openpyxl.styles import Font
_font_key_map = {
- 'sz': 'size',
- 'b': 'bold',
- 'i': 'italic',
- 'u': 'underline',
- 'strike': 'strikethrough',
- 'vertalign': 'vertAlign',
+ "sz": "size",
+ "b": "bold",
+ "i": "italic",
+ "u": "underline",
+ "strike": "strikethrough",
+ "vertalign": "vertAlign",
}
font_kwargs = {}
for k, v in font_dict.items():
if k in _font_key_map:
k = _font_key_map[k]
- if k == 'color':
+ if k == "color":
v = cls._convert_to_color(v)
font_kwargs[k] = v
@@ -222,17 +224,15 @@ def _convert_to_fill(cls, fill_dict):
from openpyxl.styles import PatternFill, GradientFill
_pattern_fill_key_map = {
- 'patternType': 'fill_type',
- 'patterntype': 'fill_type',
- 'fgColor': 'start_color',
- 'fgcolor': 'start_color',
- 'bgColor': 'end_color',
- 'bgcolor': 'end_color',
+ "patternType": "fill_type",
+ "patterntype": "fill_type",
+ "fgColor": "start_color",
+ "fgcolor": "start_color",
+ "bgColor": "end_color",
+ "bgcolor": "end_color",
}
- _gradient_fill_key_map = {
- 'fill_type': 'type',
- }
+ _gradient_fill_key_map = {"fill_type": "type"}
pfill_kwargs = {}
gfill_kwargs = {}
@@ -242,9 +242,9 @@ def _convert_to_fill(cls, fill_dict):
pk = _pattern_fill_key_map[k]
if k in _gradient_fill_key_map:
gk = _gradient_fill_key_map[k]
- if pk in ['start_color', 'end_color']:
+ if pk in ["start_color", "end_color"]:
v = cls._convert_to_color(v)
- if gk == 'stop':
+ if gk == "stop":
v = cls._convert_to_stop(v)
if pk:
pfill_kwargs[pk] = v
@@ -277,9 +277,7 @@ def _convert_to_side(cls, side_spec):
from openpyxl.styles import Side
- _side_key_map = {
- 'border_style': 'style',
- }
+ _side_key_map = {"border_style": "style"}
if isinstance(side_spec, str):
return Side(style=side_spec)
@@ -288,7 +286,7 @@ def _convert_to_side(cls, side_spec):
for k, v in side_spec.items():
if k in _side_key_map:
k = _side_key_map[k]
- if k == 'color':
+ if k == "color":
v = cls._convert_to_color(v)
side_kwargs[k] = v
@@ -320,18 +318,15 @@ def _convert_to_border(cls, border_dict):
from openpyxl.styles import Border
- _border_key_map = {
- 'diagonalup': 'diagonalUp',
- 'diagonaldown': 'diagonalDown',
- }
+ _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
border_kwargs = {}
for k, v in border_dict.items():
if k in _border_key_map:
k = _border_key_map[k]
- if k == 'color':
+ if k == "color":
v = cls._convert_to_color(v)
- if k in ['left', 'right', 'top', 'bottom', 'diagonal']:
+ if k in ["left", "right", "top", "bottom", "diagonal"]:
v = cls._convert_to_side(v)
border_kwargs[k] = v
@@ -374,7 +369,7 @@ def _convert_to_number_format(cls, number_format_dict):
-------
number_format : str
"""
- return number_format_dict['format_code']
+ return number_format_dict["format_code"]
@classmethod
def _convert_to_protection(cls, protection_dict):
@@ -394,8 +389,9 @@ def _convert_to_protection(cls, protection_dict):
return Protection(**protection_dict)
- def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
- freeze_panes=None):
+ def write_cells(
+ self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
+ ):
# Write the frame cells using openpyxl.
sheet_name = self._get_sheet_name(sheet_name)
@@ -409,13 +405,13 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
self.sheets[sheet_name] = wks
if _validate_freeze_panes(freeze_panes):
- wks.freeze_panes = wks.cell(row=freeze_panes[0] + 1,
- column=freeze_panes[1] + 1)
+ wks.freeze_panes = wks.cell(
+ row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
+ )
for cell in cells:
xcell = wks.cell(
- row=startrow + cell.row + 1,
- column=startcol + cell.col + 1
+ row=startrow + cell.row + 1, column=startcol + cell.col + 1
)
xcell.value, fmt = self._value_with_fmt(cell.val)
if fmt:
@@ -439,7 +435,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
start_row=startrow + cell.row + 1,
start_column=startcol + cell.col + 1,
end_column=startcol + cell.mergeend + 1,
- end_row=startrow + cell.mergestart + 1
+ end_row=startrow + cell.mergestart + 1,
)
# When cells are merged only the top-left cell is preserved
@@ -462,7 +458,6 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
class _OpenpyxlReader(_BaseExcelReader):
-
def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
"""Reader using openpyxl engine.
@@ -477,12 +472,15 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
@property
def _workbook_class(self):
from openpyxl import Workbook
+
return Workbook
def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
from openpyxl import load_workbook
- return load_workbook(filepath_or_buffer,
- read_only=True, data_only=True, keep_links=False)
+
+ return load_workbook(
+ filepath_or_buffer, read_only=True, data_only=True, keep_links=False
+ )
@property
def sheet_names(self) -> List[str]:
@@ -499,13 +497,13 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
# TODO: replace with openpyxl constants
if cell.is_date:
return cell.value
- elif cell.data_type == 'e':
+ elif cell.data_type == "e":
return np.nan
- elif cell.data_type == 'b':
+ elif cell.data_type == "b":
return bool(cell.value)
elif cell.value is None:
- return '' # compat with xlrd
- elif cell.data_type == 'n':
+ return "" # compat with xlrd
+ elif cell.data_type == "n":
# GH5394
if convert_float:
val = int(cell.value)
@@ -519,7 +517,6 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
data = [] # type: List[List[Scalar]]
for row in sheet.rows:
- data.append(
- [self._convert_cell(cell, convert_float) for cell in row])
+ data.append([self._convert_cell(cell, convert_float) for cell in row])
return data
diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py
index 286efea9f120e..2ba3842d5c0c9 100644
--- a/pandas/io/excel/_util.py
+++ b/pandas/io/excel/_util.py
@@ -37,12 +37,12 @@ def _get_default_writer(ext):
str
The default engine for the extension.
"""
- _default_writers = {'xlsx': 'openpyxl', 'xlsm': 'openpyxl', 'xls': 'xlwt'}
- xlsxwriter = import_optional_dependency("xlsxwriter",
- raise_on_missing=False,
- on_version="warn")
+ _default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"}
+ xlsxwriter = import_optional_dependency(
+ "xlsxwriter", raise_on_missing=False, on_version="warn"
+ )
if xlsxwriter:
- _default_writers['xlsx'] = 'xlsxwriter'
+ _default_writers["xlsx"] = "xlsxwriter"
return _default_writers[ext]
@@ -50,8 +50,7 @@ def get_writer(engine_name):
try:
return _writers[engine_name]
except KeyError:
- raise ValueError("No Excel writer '{engine}'"
- .format(engine=engine_name))
+ raise ValueError("No Excel writer '{engine}'".format(engine=engine_name))
def _excel2num(x):
@@ -137,10 +136,15 @@ def _maybe_convert_usecols(usecols):
return usecols
if is_integer(usecols):
- warnings.warn(("Passing in an integer for `usecols` has been "
- "deprecated. Please pass in a list of int from "
- "0 to `usecols` inclusive instead."),
- FutureWarning, stacklevel=2)
+ warnings.warn(
+ (
+ "Passing in an integer for `usecols` has been "
+ "deprecated. Please pass in a list of int from "
+ "0 to `usecols` inclusive instead."
+ ),
+ FutureWarning,
+ stacklevel=2,
+ )
return list(range(usecols + 1))
if isinstance(usecols, str):
@@ -151,14 +155,15 @@ def _maybe_convert_usecols(usecols):
def _validate_freeze_panes(freeze_panes):
if freeze_panes is not None:
- if (
- len(freeze_panes) == 2 and
- all(isinstance(item, int) for item in freeze_panes)
+ if len(freeze_panes) == 2 and all(
+ isinstance(item, int) for item in freeze_panes
):
return True
- raise ValueError("freeze_panes must be of form (row, column)"
- " where row and column are integers")
+ raise ValueError(
+ "freeze_panes must be of form (row, column)"
+ " where row and column are integers"
+ )
# freeze_panes wasn't specified, return False so it won't be applied
# to output sheet
@@ -168,7 +173,7 @@ def _validate_freeze_panes(freeze_panes):
def _trim_excel_header(row):
# trim header row so auto-index inference works
# xlrd uses '' , openpyxl None
- while len(row) > 0 and (row[0] == '' or row[0] is None):
+ while len(row) > 0 and (row[0] == "" or row[0] is None):
row = row[1:]
return row
@@ -195,7 +200,7 @@ def _fill_mi_header(row, control_row):
if not control_row[i]:
last = row[i]
- if row[i] == '' or row[i] is None:
+ if row[i] == "" or row[i] is None:
row[i] = last
else:
control_row[i] = False
@@ -228,4 +233,4 @@ def _pop_header_name(row, index_col):
header_name = row[i]
header_name = None if header_name == "" else header_name
- return header_name, row[:i] + [''] + row[i + 1:]
+ return header_name, row[:i] + [""] + row[i + 1 :]
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
index fcc432dc7a5ad..be1b78eeb146e 100644
--- a/pandas/io/excel/_xlrd.py
+++ b/pandas/io/excel/_xlrd.py
@@ -8,7 +8,6 @@
class _XlrdReader(_BaseExcelReader):
-
def __init__(self, filepath_or_buffer):
"""Reader using xlrd engine.
@@ -24,10 +23,12 @@ def __init__(self, filepath_or_buffer):
@property
def _workbook_class(self):
from xlrd import Book
+
return Book
def load_workbook(self, filepath_or_buffer):
from xlrd import open_workbook
+
if hasattr(filepath_or_buffer, "read"):
data = filepath_or_buffer.read()
return open_workbook(file_contents=data)
@@ -45,9 +46,13 @@ def get_sheet_by_index(self, index):
return self.book.sheet_by_index(index)
def get_sheet_data(self, sheet, convert_float):
- from xlrd import (xldate, XL_CELL_DATE,
- XL_CELL_ERROR, XL_CELL_BOOLEAN,
- XL_CELL_NUMBER)
+ from xlrd import (
+ xldate,
+ XL_CELL_DATE,
+ XL_CELL_ERROR,
+ XL_CELL_BOOLEAN,
+ XL_CELL_NUMBER,
+ )
epoch1904 = self.book.datemode
@@ -59,8 +64,7 @@ def _parse_cell(cell_contents, cell_typ):
# Use the newer xlrd datetime handling.
try:
- cell_contents = xldate.xldate_as_datetime(
- cell_contents, epoch1904)
+ cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
except OverflowError:
return cell_contents
@@ -68,12 +72,15 @@ def _parse_cell(cell_contents, cell_typ):
# so we treat dates on the epoch as times only.
# Also, Excel supports 1900 and 1904 epochs.
year = (cell_contents.timetuple())[0:3]
- if ((not epoch1904 and year == (1899, 12, 31)) or
- (epoch1904 and year == (1904, 1, 1))):
- cell_contents = time(cell_contents.hour,
- cell_contents.minute,
- cell_contents.second,
- cell_contents.microsecond)
+ if (not epoch1904 and year == (1899, 12, 31)) or (
+ epoch1904 and year == (1904, 1, 1)
+ ):
+ cell_contents = time(
+ cell_contents.hour,
+ cell_contents.minute,
+ cell_contents.second,
+ cell_contents.microsecond,
+ )
elif cell_typ == XL_CELL_ERROR:
cell_contents = np.nan
@@ -90,9 +97,10 @@ def _parse_cell(cell_contents, cell_typ):
data = []
for i in range(sheet.nrows):
- row = [_parse_cell(value, typ)
- for value, typ in zip(sheet.row_values(i),
- sheet.row_types(i))]
+ row = [
+ _parse_cell(value, typ)
+ for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
+ ]
data.append(row)
return data
diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py
index 2ddfcf3de5a8f..07bf265da4863 100644
--- a/pandas/io/excel/_xlsxwriter.py
+++ b/pandas/io/excel/_xlsxwriter.py
@@ -9,75 +9,69 @@ class _XlsxStyler:
# Ordering necessary for both determinism and because some are keyed by
# prefixes of others.
STYLE_MAPPING = {
- 'font': [
- (('name',), 'font_name'),
- (('sz',), 'font_size'),
- (('size',), 'font_size'),
- (('color', 'rgb',), 'font_color'),
- (('color',), 'font_color'),
- (('b',), 'bold'),
- (('bold',), 'bold'),
- (('i',), 'italic'),
- (('italic',), 'italic'),
- (('u',), 'underline'),
- (('underline',), 'underline'),
- (('strike',), 'font_strikeout'),
- (('vertAlign',), 'font_script'),
- (('vertalign',), 'font_script'),
+ "font": [
+ (("name",), "font_name"),
+ (("sz",), "font_size"),
+ (("size",), "font_size"),
+ (("color", "rgb"), "font_color"),
+ (("color",), "font_color"),
+ (("b",), "bold"),
+ (("bold",), "bold"),
+ (("i",), "italic"),
+ (("italic",), "italic"),
+ (("u",), "underline"),
+ (("underline",), "underline"),
+ (("strike",), "font_strikeout"),
+ (("vertAlign",), "font_script"),
+ (("vertalign",), "font_script"),
],
- 'number_format': [
- (('format_code',), 'num_format'),
- ((), 'num_format',),
+ "number_format": [(("format_code",), "num_format"), ((), "num_format")],
+ "protection": [(("locked",), "locked"), (("hidden",), "hidden")],
+ "alignment": [
+ (("horizontal",), "align"),
+ (("vertical",), "valign"),
+ (("text_rotation",), "rotation"),
+ (("wrap_text",), "text_wrap"),
+ (("indent",), "indent"),
+ (("shrink_to_fit",), "shrink"),
],
- 'protection': [
- (('locked',), 'locked'),
- (('hidden',), 'hidden'),
+ "fill": [
+ (("patternType",), "pattern"),
+ (("patterntype",), "pattern"),
+ (("fill_type",), "pattern"),
+ (("start_color", "rgb"), "fg_color"),
+ (("fgColor", "rgb"), "fg_color"),
+ (("fgcolor", "rgb"), "fg_color"),
+ (("start_color",), "fg_color"),
+ (("fgColor",), "fg_color"),
+ (("fgcolor",), "fg_color"),
+ (("end_color", "rgb"), "bg_color"),
+ (("bgColor", "rgb"), "bg_color"),
+ (("bgcolor", "rgb"), "bg_color"),
+ (("end_color",), "bg_color"),
+ (("bgColor",), "bg_color"),
+ (("bgcolor",), "bg_color"),
],
- 'alignment': [
- (('horizontal',), 'align'),
- (('vertical',), 'valign'),
- (('text_rotation',), 'rotation'),
- (('wrap_text',), 'text_wrap'),
- (('indent',), 'indent'),
- (('shrink_to_fit',), 'shrink'),
- ],
- 'fill': [
- (('patternType',), 'pattern'),
- (('patterntype',), 'pattern'),
- (('fill_type',), 'pattern'),
- (('start_color', 'rgb',), 'fg_color'),
- (('fgColor', 'rgb',), 'fg_color'),
- (('fgcolor', 'rgb',), 'fg_color'),
- (('start_color',), 'fg_color'),
- (('fgColor',), 'fg_color'),
- (('fgcolor',), 'fg_color'),
- (('end_color', 'rgb',), 'bg_color'),
- (('bgColor', 'rgb',), 'bg_color'),
- (('bgcolor', 'rgb',), 'bg_color'),
- (('end_color',), 'bg_color'),
- (('bgColor',), 'bg_color'),
- (('bgcolor',), 'bg_color'),
- ],
- 'border': [
- (('color', 'rgb',), 'border_color'),
- (('color',), 'border_color'),
- (('style',), 'border'),
- (('top', 'color', 'rgb',), 'top_color'),
- (('top', 'color',), 'top_color'),
- (('top', 'style',), 'top'),
- (('top',), 'top'),
- (('right', 'color', 'rgb',), 'right_color'),
- (('right', 'color',), 'right_color'),
- (('right', 'style',), 'right'),
- (('right',), 'right'),
- (('bottom', 'color', 'rgb',), 'bottom_color'),
- (('bottom', 'color',), 'bottom_color'),
- (('bottom', 'style',), 'bottom'),
- (('bottom',), 'bottom'),
- (('left', 'color', 'rgb',), 'left_color'),
- (('left', 'color',), 'left_color'),
- (('left', 'style',), 'left'),
- (('left',), 'left'),
+ "border": [
+ (("color", "rgb"), "border_color"),
+ (("color",), "border_color"),
+ (("style",), "border"),
+ (("top", "color", "rgb"), "top_color"),
+ (("top", "color"), "top_color"),
+ (("top", "style"), "top"),
+ (("top",), "top"),
+ (("right", "color", "rgb"), "right_color"),
+ (("right", "color"), "right_color"),
+ (("right", "style"), "right"),
+ (("right",), "right"),
+ (("bottom", "color", "rgb"), "bottom_color"),
+ (("bottom", "color"), "bottom_color"),
+ (("bottom", "style"), "bottom"),
+ (("bottom",), "bottom"),
+ (("left", "color", "rgb"), "left_color"),
+ (("left", "color"), "left_color"),
+ (("left", "style"), "left"),
+ (("left",), "left"),
],
}
@@ -96,14 +90,14 @@ def convert(cls, style_dict, num_format_str=None):
props = {}
if num_format_str is not None:
- props['num_format'] = num_format_str
+ props["num_format"] = num_format_str
if style_dict is None:
return props
- if 'borders' in style_dict:
+ if "borders" in style_dict:
style_dict = style_dict.copy()
- style_dict['border'] = style_dict.pop('borders')
+ style_dict["border"] = style_dict.pop("borders")
for style_group_key, style_group in style_dict.items():
for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
@@ -120,51 +114,76 @@ def convert(cls, style_dict, num_format_str=None):
else:
props[dst] = v
- if isinstance(props.get('pattern'), str):
+ if isinstance(props.get("pattern"), str):
# TODO: support other fill patterns
- props['pattern'] = 0 if props['pattern'] == 'none' else 1
+ props["pattern"] = 0 if props["pattern"] == "none" else 1
- for k in ['border', 'top', 'right', 'bottom', 'left']:
+ for k in ["border", "top", "right", "bottom", "left"]:
if isinstance(props.get(k), str):
try:
- props[k] = ['none', 'thin', 'medium', 'dashed', 'dotted',
- 'thick', 'double', 'hair', 'mediumDashed',
- 'dashDot', 'mediumDashDot', 'dashDotDot',
- 'mediumDashDotDot',
- 'slantDashDot'].index(props[k])
+ props[k] = [
+ "none",
+ "thin",
+ "medium",
+ "dashed",
+ "dotted",
+ "thick",
+ "double",
+ "hair",
+ "mediumDashed",
+ "dashDot",
+ "mediumDashDot",
+ "dashDotDot",
+ "mediumDashDotDot",
+ "slantDashDot",
+ ].index(props[k])
except ValueError:
props[k] = 2
- if isinstance(props.get('font_script'), str):
- props['font_script'] = ['baseline', 'superscript',
- 'subscript'].index(props['font_script'])
+ if isinstance(props.get("font_script"), str):
+ props["font_script"] = ["baseline", "superscript", "subscript"].index(
+ props["font_script"]
+ )
- if isinstance(props.get('underline'), str):
- props['underline'] = {'none': 0, 'single': 1, 'double': 2,
- 'singleAccounting': 33,
- 'doubleAccounting': 34}[props['underline']]
+ if isinstance(props.get("underline"), str):
+ props["underline"] = {
+ "none": 0,
+ "single": 1,
+ "double": 2,
+ "singleAccounting": 33,
+ "doubleAccounting": 34,
+ }[props["underline"]]
return props
class _XlsxWriter(ExcelWriter):
- engine = 'xlsxwriter'
- supported_extensions = ('.xlsx',)
-
- def __init__(self, path, engine=None,
- date_format=None, datetime_format=None, mode='w',
- **engine_kwargs):
+ engine = "xlsxwriter"
+ supported_extensions = (".xlsx",)
+
+ def __init__(
+ self,
+ path,
+ engine=None,
+ date_format=None,
+ datetime_format=None,
+ mode="w",
+ **engine_kwargs
+ ):
# Use the xlsxwriter module as the Excel writer.
import xlsxwriter
- if mode == 'a':
- raise ValueError('Append mode is not supported with xlsxwriter!')
+ if mode == "a":
+ raise ValueError("Append mode is not supported with xlsxwriter!")
- super().__init__(path, engine=engine,
- date_format=date_format,
- datetime_format=datetime_format,
- mode=mode,
- **engine_kwargs)
+ super().__init__(
+ path,
+ engine=engine,
+ date_format=date_format,
+ datetime_format=datetime_format,
+ mode=mode,
+ **engine_kwargs
+ )
self.book = xlsxwriter.Workbook(path, **engine_kwargs)
@@ -175,8 +194,9 @@ def save(self):
return self.book.close()
- def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
- freeze_panes=None):
+ def write_cells(
+ self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
+ ):
# Write the frame cells using xlsxwriter.
sheet_name = self._get_sheet_name(sheet_name)
@@ -186,7 +206,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
wks = self.book.add_worksheet(sheet_name)
self.sheets[sheet_name] = wks
- style_dict = {'null': None}
+ style_dict = {"null": None}
if _validate_freeze_panes(freeze_panes):
wks.freeze_panes(*(freeze_panes))
@@ -201,17 +221,17 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
if stylekey in style_dict:
style = style_dict[stylekey]
else:
- style = self.book.add_format(
- _XlsxStyler.convert(cell.style, fmt))
+ style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt))
style_dict[stylekey] = style
if cell.mergestart is not None and cell.mergeend is not None:
- wks.merge_range(startrow + cell.row,
- startcol + cell.col,
- startrow + cell.mergestart,
- startcol + cell.mergeend,
- val, style)
+ wks.merge_range(
+ startrow + cell.row,
+ startcol + cell.col,
+ startrow + cell.mergestart,
+ startcol + cell.mergeend,
+ val,
+ style,
+ )
else:
- wks.write(startrow + cell.row,
- startcol + cell.col,
- val, style)
+ wks.write(startrow + cell.row, startcol + cell.col, val, style)
diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py
index 62a57b99fe556..fe3d0a208de6a 100644
--- a/pandas/io/excel/_xlwt.py
+++ b/pandas/io/excel/_xlwt.py
@@ -5,22 +5,22 @@
class _XlwtWriter(ExcelWriter):
- engine = 'xlwt'
- supported_extensions = ('.xls',)
+ engine = "xlwt"
+ supported_extensions = (".xls",)
- def __init__(self, path, engine=None, encoding=None, mode='w',
- **engine_kwargs):
+ def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs):
# Use the xlwt module as the Excel writer.
import xlwt
- engine_kwargs['engine'] = engine
- if mode == 'a':
- raise ValueError('Append mode is not supported with xlwt!')
+ engine_kwargs["engine"] = engine
+
+ if mode == "a":
+ raise ValueError("Append mode is not supported with xlwt!")
super().__init__(path, mode=mode, **engine_kwargs)
if encoding is None:
- encoding = 'ascii'
+ encoding = "ascii"
self.book = xlwt.Workbook(encoding=encoding)
self.fm_datetime = xlwt.easyxf(num_format_str=self.datetime_format)
self.fm_date = xlwt.easyxf(num_format_str=self.date_format)
@@ -31,8 +31,9 @@ def save(self):
"""
return self.book.save(self.path)
- def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
- freeze_panes=None):
+ def write_cells(
+ self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None
+ ):
# Write the frame cells using xlwt.
sheet_name = self._get_sheet_name(sheet_name)
@@ -64,19 +65,19 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
style_dict[stylekey] = style
if cell.mergestart is not None and cell.mergeend is not None:
- wks.write_merge(startrow + cell.row,
- startrow + cell.mergestart,
- startcol + cell.col,
- startcol + cell.mergeend,
- val, style)
+ wks.write_merge(
+ startrow + cell.row,
+ startrow + cell.mergestart,
+ startcol + cell.col,
+ startcol + cell.mergeend,
+ val,
+ style,
+ )
else:
- wks.write(startrow + cell.row,
- startcol + cell.col,
- val, style)
+ wks.write(startrow + cell.row, startcol + cell.col, val, style)
@classmethod
- def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',',
- line_sep=';'):
+ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=",", line_sep=";"):
"""helper which recursively generate an xlwt easy style string
for example:
@@ -91,17 +92,19 @@ def _style_to_xlwt(cls, item, firstlevel=True, field_sep=',',
border: top thin, right thin, bottom thin, left thin; \
align: horiz center;
"""
- if hasattr(item, 'items'):
+ if hasattr(item, "items"):
if firstlevel:
- it = ["{key}: {val}"
- .format(key=key, val=cls._style_to_xlwt(value, False))
- for key, value in item.items()]
+ it = [
+ "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False))
+ for key, value in item.items()
+ ]
out = "{sep} ".format(sep=(line_sep).join(it))
return out
else:
- it = ["{key} {val}"
- .format(key=key, val=cls._style_to_xlwt(value, False))
- for key, value in item.items()]
+ it = [
+ "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False))
+ for key, value in item.items()
+ ]
out = "{sep} ".format(sep=(field_sep).join(it))
return out
else:
@@ -123,7 +126,7 @@ def _convert_to_style(cls, style_dict, num_format_str=None):
if style_dict:
xlwt_stylestr = cls._style_to_xlwt(style_dict)
- style = xlwt.easyxf(xlwt_stylestr, field_sep=',', line_sep=';')
+ style = xlwt.easyxf(xlwt_stylestr, field_sep=",", line_sep=";")
else:
style = xlwt.XFStyle()
if num_format_str is not None:
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 93252f3a09ceb..05608f69c0d9d 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -28,7 +28,7 @@ def to_feather(df, path):
if not isinstance(df, DataFrame):
raise ValueError("feather only support IO with DataFrames")
- valid_types = {'string', 'unicode'}
+ valid_types = {"string", "unicode"}
# validate index
# --------------
@@ -37,20 +37,24 @@ def to_feather(df, path):
# raise on anything else as we don't serialize the index
if not isinstance(df.index, Int64Index):
- raise ValueError("feather does not support serializing {} "
- "for the index; you can .reset_index()"
- "to make the index into column(s)".format(
- type(df.index)))
+ raise ValueError(
+ "feather does not support serializing {} "
+ "for the index; you can .reset_index()"
+ "to make the index into column(s)".format(type(df.index))
+ )
if not df.index.equals(RangeIndex.from_range(range(len(df)))):
- raise ValueError("feather does not support serializing a "
- "non-default index for the index; you "
- "can .reset_index() to make the index "
- "into column(s)")
+ raise ValueError(
+ "feather does not support serializing a "
+ "non-default index for the index; you "
+ "can .reset_index() to make the index "
+ "into column(s)"
+ )
if df.index.name is not None:
- raise ValueError("feather does not serialize index meta-data on a "
- "default index")
+ raise ValueError(
+ "feather does not serialize index meta-data on a " "default index"
+ )
# validate columns
# ----------------
@@ -62,7 +66,7 @@ def to_feather(df, path):
feather.write_feather(df, path)
-@deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads')
+@deprecate_kwarg(old_arg_name="nthreads", new_arg_name="use_threads")
def read_feather(path, columns=None, use_threads=True):
"""
Load a feather-format object from the file path
@@ -95,12 +99,10 @@ def read_feather(path, columns=None, use_threads=True):
path = _stringify_path(path)
- if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
+ if LooseVersion(pyarrow.__version__) < LooseVersion("0.11.0"):
int_use_threads = int(use_threads)
if int_use_threads < 1:
int_use_threads = 1
- return feather.read_feather(path, columns=columns,
- nthreads=int_use_threads)
+ return feather.read_feather(path, columns=columns, nthreads=int_use_threads)
- return feather.read_feather(path, columns=columns,
- use_threads=bool(use_threads))
+ return feather.read_feather(path, columns=columns, use_threads=bool(use_threads))
diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py
index 19c822e5dc270..7f8f2fbea2352 100644
--- a/pandas/io/formats/console.py
+++ b/pandas/io/formats/console.py
@@ -12,9 +12,9 @@ def get_console_size():
"""
from pandas import get_option
- display_width = get_option('display.width')
+ display_width = get_option("display.width")
# deprecated.
- display_height = get_option('display.max_rows')
+ display_height = get_option("display.max_rows")
# Consider
# interactive shell terminal, can detect term size
@@ -31,8 +31,9 @@ def get_console_size():
# sane defaults for interactive non-shell terminal
# match default for width,height in config_init
from pandas._config.config import get_default_val
- terminal_width = get_default_val('display.width')
- terminal_height = get_default_val('display.max_rows')
+
+ terminal_width = get_default_val("display.width")
+ terminal_height = get_default_val("display.max_rows")
else:
# pure terminal
terminal_width, terminal_height = get_terminal_size()
@@ -48,6 +49,7 @@ def get_console_size():
# ----------------------------------------------------------------------
# Detect our environment
+
def in_interactive_session():
""" check if we're running in an interactive shell
@@ -59,9 +61,8 @@ def check_main():
try:
import __main__ as main
except ModuleNotFoundError:
- return get_option('mode.sim_interactive')
- return (not hasattr(main, '__file__') or
- get_option('mode.sim_interactive'))
+ return get_option("mode.sim_interactive")
+ return not hasattr(main, "__file__") or get_option("mode.sim_interactive")
try:
return __IPYTHON__ or check_main() # noqa
@@ -75,7 +76,7 @@ def in_ipython_frontend():
"""
try:
ip = get_ipython() # noqa
- return 'zmq' in str(type(ip)).lower()
+ return "zmq" in str(type(ip)).lower()
except NameError:
pass
diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py
index 2527e45650ea3..92fe87cddb35b 100644
--- a/pandas/io/formats/css.py
+++ b/pandas/io/formats/css.py
@@ -7,6 +7,7 @@
class CSSWarning(UserWarning):
"""This CSS syntax cannot currently be parsed"""
+
pass
@@ -63,9 +64,9 @@ def __call__(self, declarations_str, inherited=None):
props[prop] = val
for prop, val in list(props.items()):
- if val == 'inherit':
- val = inherited.get(prop, 'initial')
- if val == 'initial':
+ if val == "inherit":
+ val = inherited.get(prop, "initial")
+ if val == "initial":
val = None
if val is None:
@@ -75,90 +76,94 @@ def __call__(self, declarations_str, inherited=None):
props[prop] = val
# 2. resolve relative font size
- if props.get('font-size'):
- if 'font-size' in inherited:
- em_pt = inherited['font-size']
- assert em_pt[-2:] == 'pt'
+ if props.get("font-size"):
+ if "font-size" in inherited:
+ em_pt = inherited["font-size"]
+ assert em_pt[-2:] == "pt"
em_pt = float(em_pt[:-2])
else:
em_pt = None
- props['font-size'] = self.size_to_pt(
- props['font-size'], em_pt, conversions=self.FONT_SIZE_RATIOS)
+ props["font-size"] = self.size_to_pt(
+ props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS
+ )
- font_size = float(props['font-size'][:-2])
+ font_size = float(props["font-size"][:-2])
else:
font_size = None
# 3. TODO: resolve other font-relative units
for side in self.SIDES:
- prop = 'border-{side}-width'.format(side=side)
+ prop = "border-{side}-width".format(side=side)
if prop in props:
props[prop] = self.size_to_pt(
- props[prop], em_pt=font_size,
- conversions=self.BORDER_WIDTH_RATIOS)
- for prop in ['margin-{side}'.format(side=side),
- 'padding-{side}'.format(side=side)]:
+ props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS
+ )
+ for prop in [
+ "margin-{side}".format(side=side),
+ "padding-{side}".format(side=side),
+ ]:
if prop in props:
# TODO: support %
props[prop] = self.size_to_pt(
- props[prop], em_pt=font_size,
- conversions=self.MARGIN_RATIOS)
+ props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS
+ )
return props
UNIT_RATIOS = {
- 'rem': ('pt', 12),
- 'ex': ('em', .5),
+ "rem": ("pt", 12),
+ "ex": ("em", 0.5),
# 'ch':
- 'px': ('pt', .75),
- 'pc': ('pt', 12),
- 'in': ('pt', 72),
- 'cm': ('in', 1 / 2.54),
- 'mm': ('in', 1 / 25.4),
- 'q': ('mm', .25),
- '!!default': ('em', 0),
+ "px": ("pt", 0.75),
+ "pc": ("pt", 12),
+ "in": ("pt", 72),
+ "cm": ("in", 1 / 2.54),
+ "mm": ("in", 1 / 25.4),
+ "q": ("mm", 0.25),
+ "!!default": ("em", 0),
}
FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
- FONT_SIZE_RATIOS.update({
- '%': ('em', .01),
- 'xx-small': ('rem', .5),
- 'x-small': ('rem', .625),
- 'small': ('rem', .8),
- 'medium': ('rem', 1),
- 'large': ('rem', 1.125),
- 'x-large': ('rem', 1.5),
- 'xx-large': ('rem', 2),
- 'smaller': ('em', 1 / 1.2),
- 'larger': ('em', 1.2),
- '!!default': ('em', 1),
- })
+ FONT_SIZE_RATIOS.update(
+ {
+ "%": ("em", 0.01),
+ "xx-small": ("rem", 0.5),
+ "x-small": ("rem", 0.625),
+ "small": ("rem", 0.8),
+ "medium": ("rem", 1),
+ "large": ("rem", 1.125),
+ "x-large": ("rem", 1.5),
+ "xx-large": ("rem", 2),
+ "smaller": ("em", 1 / 1.2),
+ "larger": ("em", 1.2),
+ "!!default": ("em", 1),
+ }
+ )
MARGIN_RATIOS = UNIT_RATIOS.copy()
- MARGIN_RATIOS.update({
- 'none': ('pt', 0),
- })
+ MARGIN_RATIOS.update({"none": ("pt", 0)})
BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
- BORDER_WIDTH_RATIOS.update({
- 'none': ('pt', 0),
- 'thick': ('px', 4),
- 'medium': ('px', 2),
- 'thin': ('px', 1),
- # Default: medium only if solid
- })
+ BORDER_WIDTH_RATIOS.update(
+ {
+ "none": ("pt", 0),
+ "thick": ("px", 4),
+ "medium": ("px", 2),
+ "thin": ("px", 1),
+ # Default: medium only if solid
+ }
+ )
def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
def _error():
- warnings.warn('Unhandled size: {val!r}'.format(val=in_val),
- CSSWarning)
- return self.size_to_pt('1!!default', conversions=conversions)
+ warnings.warn("Unhandled size: {val!r}".format(val=in_val), CSSWarning)
+ return self.size_to_pt("1!!default", conversions=conversions)
try:
- val, unit = re.match(r'^(\S*?)([a-zA-Z%!].*)', in_val).groups()
+ val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups()
except AttributeError:
return _error()
- if val == '':
+ if val == "":
# hack for 'large' etc.
val = 1
else:
@@ -167,13 +172,13 @@ def _error():
except ValueError:
return _error()
- while unit != 'pt':
- if unit == 'em':
+ while unit != "pt":
+ if unit == "em":
if em_pt is None:
- unit = 'rem'
+ unit = "rem"
else:
val *= em_pt
- unit = 'pt'
+ unit = "pt"
continue
try:
@@ -184,14 +189,14 @@ def _error():
val = round(val, 5)
if int(val) == val:
- size_fmt = '{fmt:d}pt'.format(fmt=int(val))
+ size_fmt = "{fmt:d}pt".format(fmt=int(val))
else:
- size_fmt = '{fmt:f}pt'.format(fmt=val)
+ size_fmt = "{fmt:f}pt".format(fmt=val)
return size_fmt
def atomize(self, declarations):
for prop, value in declarations:
- attr = 'expand_' + prop.replace('-', '_')
+ attr = "expand_" + prop.replace("-", "_")
try:
expand = getattr(self, attr)
except AttributeError:
@@ -206,7 +211,7 @@ def atomize(self, declarations):
3: [0, 1, 2, 1],
4: [0, 1, 2, 3],
}
- SIDES = ('top', 'right', 'bottom', 'left')
+ SIDES = ("top", "right", "bottom", "left")
def _side_expander(prop_fmt):
def expand(self, prop, value):
@@ -214,34 +219,39 @@ def expand(self, prop, value):
try:
mapping = self.SIDE_SHORTHANDS[len(tokens)]
except KeyError:
- warnings.warn('Could not expand "{prop}: {val}"'
- .format(prop=prop, val=value), CSSWarning)
+ warnings.warn(
+ 'Could not expand "{prop}: {val}"'.format(prop=prop, val=value),
+ CSSWarning,
+ )
return
for key, idx in zip(self.SIDES, mapping):
yield prop_fmt.format(key), tokens[idx]
return expand
- expand_border_color = _side_expander('border-{:s}-color')
- expand_border_style = _side_expander('border-{:s}-style')
- expand_border_width = _side_expander('border-{:s}-width')
- expand_margin = _side_expander('margin-{:s}')
- expand_padding = _side_expander('padding-{:s}')
+ expand_border_color = _side_expander("border-{:s}-color")
+ expand_border_style = _side_expander("border-{:s}-style")
+ expand_border_width = _side_expander("border-{:s}-width")
+ expand_margin = _side_expander("margin-{:s}")
+ expand_padding = _side_expander("padding-{:s}")
def parse(self, declarations_str):
"""Generates (prop, value) pairs from declarations
In a future version may generate parsed tokens from tinycss/tinycss2
"""
- for decl in declarations_str.split(';'):
+ for decl in declarations_str.split(";"):
if not decl.strip():
continue
- prop, sep, val = decl.partition(':')
+ prop, sep, val = decl.partition(":")
prop = prop.strip().lower()
# TODO: don't lowercase case sensitive parts of values (strings)
val = val.strip().lower()
if sep:
yield prop, val
else:
- warnings.warn('Ill-formatted attribute: expected a colon '
- 'in {decl!r}'.format(decl=decl), CSSWarning)
+ warnings.warn(
+ "Ill-formatted attribute: expected a colon "
+ "in {decl!r}".format(decl=decl),
+ CSSWarning,
+ )
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index e1d95862ec872..d86bf432b83c4 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -13,22 +13,45 @@
from pandas._libs import writers as libwriters
from pandas.core.dtypes.generic import (
- ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex)
+ ABCDatetimeIndex,
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCPeriodIndex,
+)
from pandas.core.dtypes.missing import notna
from pandas.io.common import (
- UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer)
+ UnicodeWriter,
+ _get_handle,
+ _infer_compression,
+ get_filepath_or_buffer,
+)
class CSVFormatter:
-
- def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
- float_format=None, cols=None, header=True, index=True,
- index_label=None, mode='w', encoding=None,
- compression='infer', quoting=None, line_terminator='\n',
- chunksize=None, quotechar='"',
- date_format=None, doublequote=True, escapechar=None,
- decimal='.'):
+ def __init__(
+ self,
+ obj,
+ path_or_buf=None,
+ sep=",",
+ na_rep="",
+ float_format=None,
+ cols=None,
+ header=True,
+ index=True,
+ index_label=None,
+ mode="w",
+ encoding=None,
+ compression="infer",
+ quoting=None,
+ line_terminator="\n",
+ chunksize=None,
+ quotechar='"',
+ date_format=None,
+ doublequote=True,
+ escapechar=None,
+ decimal=".",
+ ):
self.obj = obj
@@ -48,7 +71,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
self.index_label = index_label
self.mode = mode
if encoding is None:
- encoding = 'utf-8'
+ encoding = "utf-8"
self.encoding = encoding
self.compression = _infer_compression(self.path_or_buf, compression)
@@ -73,15 +96,18 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
# validate mi options
if self.has_mi_columns:
if cols is not None:
- raise TypeError("cannot specify cols with a MultiIndex on the "
- "columns")
+ raise TypeError(
+ "cannot specify cols with a MultiIndex on the " "columns"
+ )
if cols is not None:
if isinstance(cols, ABCIndexClass):
- cols = cols.to_native_types(na_rep=na_rep,
- float_format=float_format,
- date_format=date_format,
- quoting=self.quoting)
+ cols = cols.to_native_types(
+ na_rep=na_rep,
+ float_format=float_format,
+ date_format=date_format,
+ quoting=self.quoting,
+ )
else:
cols = list(cols)
self.obj = self.obj.loc[:, cols]
@@ -90,10 +116,12 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
# and make sure sure cols is just a list of labels
cols = self.obj.columns
if isinstance(cols, ABCIndexClass):
- cols = cols.to_native_types(na_rep=na_rep,
- float_format=float_format,
- date_format=date_format,
- quoting=self.quoting)
+ cols = cols.to_native_types(
+ na_rep=na_rep,
+ float_format=float_format,
+ date_format=date_format,
+ quoting=self.quoting,
+ )
else:
cols = list(cols)
@@ -110,13 +138,17 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
self.chunksize = int(chunksize)
self.data_index = obj.index
- if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
- date_format is not None):
+ if (
+ isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex))
+ and date_format is not None
+ ):
from pandas import Index
- self.data_index = Index([x.strftime(date_format) if notna(x) else
- '' for x in self.data_index])
- self.nlevels = getattr(self.data_index, 'nlevels', 1)
+ self.data_index = Index(
+ [x.strftime(date_format) if notna(x) else "" for x in self.data_index]
+ )
+
+ self.nlevels = getattr(self.data_index, "nlevels", 1)
if not index:
self.nlevels = 0
@@ -125,15 +157,14 @@ def save(self):
Create the writer & save
"""
# GH21227 internal compression is not used when file-like passed.
- if self.compression and hasattr(self.path_or_buf, 'write'):
- msg = ("compression has no effect when passing file-like "
- "object as input.")
+ if self.compression and hasattr(self.path_or_buf, "write"):
+ msg = "compression has no effect when passing file-like " "object as input."
warnings.warn(msg, RuntimeWarning, stacklevel=2)
# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
- not hasattr(self.path_or_buf, 'write')
- and self.compression == 'zip')
+ not hasattr(self.path_or_buf, "write") and self.compression == "zip"
+ )
if is_zip:
# zipfile doesn't support writing string to archive. uses string
@@ -141,25 +172,31 @@ def save(self):
# file handle. GH21241, GH21118
f = StringIO()
close = False
- elif hasattr(self.path_or_buf, 'write'):
+ elif hasattr(self.path_or_buf, "write"):
f = self.path_or_buf
close = False
else:
- f, handles = _get_handle(self.path_or_buf, self.mode,
- encoding=self.encoding,
- compression=self.compression)
+ f, handles = _get_handle(
+ self.path_or_buf,
+ self.mode,
+ encoding=self.encoding,
+ compression=self.compression,
+ )
close = True
try:
- writer_kwargs = dict(lineterminator=self.line_terminator,
- delimiter=self.sep, quoting=self.quoting,
- doublequote=self.doublequote,
- escapechar=self.escapechar,
- quotechar=self.quotechar)
- if self.encoding == 'ascii':
+ writer_kwargs = dict(
+ lineterminator=self.line_terminator,
+ delimiter=self.sep,
+ quoting=self.quoting,
+ doublequote=self.doublequote,
+ escapechar=self.escapechar,
+ quotechar=self.quotechar,
+ )
+ if self.encoding == "ascii":
self.writer = csvlib.writer(f, **writer_kwargs)
else:
- writer_kwargs['encoding'] = self.encoding
+ writer_kwargs["encoding"] = self.encoding
self.writer = UnicodeWriter(f, **writer_kwargs)
self._save()
@@ -168,12 +205,15 @@ def save(self):
if is_zip:
# GH17778 handles zip compression separately.
buf = f.getvalue()
- if hasattr(self.path_or_buf, 'write'):
+ if hasattr(self.path_or_buf, "write"):
self.path_or_buf.write(buf)
else:
- f, handles = _get_handle(self.path_or_buf, self.mode,
- encoding=self.encoding,
- compression=self.compression)
+ f, handles = _get_handle(
+ self.path_or_buf,
+ self.mode,
+ encoding=self.encoding,
+ compression=self.compression,
+ )
f.write(buf)
close = True
if close:
@@ -191,15 +231,17 @@ def _save_header(self):
header = self.header
encoded_labels = []
- has_aliases = isinstance(header, (tuple, list, np.ndarray,
- ABCIndexClass))
+ has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass))
if not (has_aliases or self.header):
return
if has_aliases:
if len(header) != len(cols):
- raise ValueError(('Writing {ncols} cols but got {nalias} '
- 'aliases'.format(ncols=len(cols),
- nalias=len(header))))
+ raise ValueError(
+ (
+ "Writing {ncols} cols but got {nalias} "
+ "aliases".format(ncols=len(cols), nalias=len(header))
+ )
+ )
else:
write_cols = header
else:
@@ -213,16 +255,17 @@ def _save_header(self):
index_label = []
for i, name in enumerate(obj.index.names):
if name is None:
- name = ''
+ name = ""
index_label.append(name)
else:
index_label = obj.index.name
if index_label is None:
- index_label = ['']
+ index_label = [""]
else:
index_label = [index_label]
- elif not isinstance(index_label,
- (list, tuple, np.ndarray, ABCIndexClass)):
+ elif not isinstance(
+ index_label, (list, tuple, np.ndarray, ABCIndexClass)
+ ):
# given a string for a DF with Index
index_label = [index_label]
@@ -249,7 +292,7 @@ def _save_header(self):
col_line.append(columns.names[i])
if isinstance(index_label, list) and len(index_label) > 1:
- col_line.extend([''] * (len(index_label) - 1))
+ col_line.extend([""] * (len(index_label) - 1))
col_line.extend(columns._get_level_values(i))
@@ -258,8 +301,8 @@ def _save_header(self):
# Write out the index line if it's not empty.
# Otherwise, we will print out an extraneous
# blank line between the mi and the data rows.
- if encoded_labels and set(encoded_labels) != {''}:
- encoded_labels.extend([''] * len(columns))
+ if encoded_labels and set(encoded_labels) != {""}:
+ encoded_labels.extend([""] * len(columns))
writer.writerow(encoded_labels)
def _save(self):
@@ -288,21 +331,26 @@ def _save_chunk(self, start_i, end_i):
slicer = slice(start_i, end_i)
for i in range(len(self.blocks)):
b = self.blocks[i]
- d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
- float_format=self.float_format,
- decimal=self.decimal,
- date_format=self.date_format,
- quoting=self.quoting)
+ d = b.to_native_types(
+ slicer=slicer,
+ na_rep=self.na_rep,
+ float_format=self.float_format,
+ decimal=self.decimal,
+ date_format=self.date_format,
+ quoting=self.quoting,
+ )
for col_loc, col in zip(b.mgr_locs, d):
# self.data is a preallocated list
self.data[col_loc] = col
- ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
- float_format=self.float_format,
- decimal=self.decimal,
- date_format=self.date_format,
- quoting=self.quoting)
+ ix = data_index.to_native_types(
+ slicer=slicer,
+ na_rep=self.na_rep,
+ float_format=self.float_format,
+ decimal=self.decimal,
+ date_format=self.date_format,
+ quoting=self.quoting,
+ )
- libwriters.write_csv_rows(self.data, ix, self.nlevels,
- self.cols, self.writer)
+ libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer)
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
index 66a00bf9ab054..012d2d9358241 100644
--- a/pandas/io/formats/excel.py
+++ b/pandas/io/formats/excel.py
@@ -21,11 +21,10 @@
class ExcelCell:
- __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
+ __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
__slots__ = __fields__
- def __init__(self, row, col, val, style=None, mergestart=None,
- mergeend=None):
+ def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None):
self.row = row
self.col = col
self.val = val
@@ -50,6 +49,7 @@ class CSSToExcelConverter:
CSS declarations understood to be the containing scope for the
CSS processed by :meth:`__call__`.
"""
+
# NB: Most of the methods here could be classmethods, as only __init__
# and __call__ make use of instance attributes. We leave them as
# instancemethods so that users can easily experiment with extensions
@@ -84,11 +84,11 @@ def __call__(self, declarations_str):
def build_xlstyle(self, props):
out = {
- 'alignment': self.build_alignment(props),
- 'border': self.build_border(props),
- 'fill': self.build_fill(props),
- 'font': self.build_font(props),
- 'number_format': self.build_number_format(props),
+ "alignment": self.build_alignment(props),
+ "border": self.build_border(props),
+ "fill": self.build_fill(props),
+ "font": self.build_font(props),
+ "number_format": self.build_number_format(props),
}
# TODO: handle cell width and height: needs support in pandas.io.excel
@@ -106,33 +106,40 @@ def remove_none(d):
return out
VERTICAL_MAP = {
- 'top': 'top',
- 'text-top': 'top',
- 'middle': 'center',
- 'baseline': 'bottom',
- 'bottom': 'bottom',
- 'text-bottom': 'bottom',
+ "top": "top",
+ "text-top": "top",
+ "middle": "center",
+ "baseline": "bottom",
+ "bottom": "bottom",
+ "text-bottom": "bottom",
# OpenXML also has 'justify', 'distributed'
}
def build_alignment(self, props):
# TODO: text-indent, padding-left -> alignment.indent
- return {'horizontal': props.get('text-align'),
- 'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')),
- 'wrap_text': (None if props.get('white-space') is None else
- props['white-space'] not in
- ('nowrap', 'pre', 'pre-line'))
- }
+ return {
+ "horizontal": props.get("text-align"),
+ "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")),
+ "wrap_text": (
+ None
+ if props.get("white-space") is None
+ else props["white-space"] not in ("nowrap", "pre", "pre-line")
+ ),
+ }
def build_border(self, props):
- return {side: {
- 'style': self._border_style(props.get('border-{side}-style'
- .format(side=side)),
- props.get('border-{side}-width'
- .format(side=side))),
- 'color': self.color_to_excel(
- props.get('border-{side}-color'.format(side=side))),
- } for side in ['top', 'right', 'bottom', 'left']}
+ return {
+ side: {
+ "style": self._border_style(
+ props.get("border-{side}-style".format(side=side)),
+ props.get("border-{side}-width".format(side=side)),
+ ),
+ "color": self.color_to_excel(
+ props.get("border-{side}-color".format(side=side))
+ ),
+ }
+ for side in ["top", "right", "bottom", "left"]
+ }
def _border_style(self, style, width):
# convert styles and widths to openxml, one of:
@@ -151,61 +158,70 @@ def _border_style(self, style, width):
# 'thin'
if width is None and style is None:
return None
- if style == 'none' or style == 'hidden':
+ if style == "none" or style == "hidden":
return None
if width is None:
- width = '2pt'
+ width = "2pt"
width = float(width[:-2])
if width < 1e-5:
return None
elif width < 1.3:
- width_name = 'thin'
+ width_name = "thin"
elif width < 2.8:
- width_name = 'medium'
+ width_name = "medium"
else:
- width_name = 'thick'
+ width_name = "thick"
- if style in (None, 'groove', 'ridge', 'inset', 'outset'):
+ if style in (None, "groove", "ridge", "inset", "outset"):
# not handled
- style = 'solid'
+ style = "solid"
- if style == 'double':
- return 'double'
- if style == 'solid':
+ if style == "double":
+ return "double"
+ if style == "solid":
return width_name
- if style == 'dotted':
- if width_name in ('hair', 'thin'):
- return 'dotted'
- return 'mediumDashDotDot'
- if style == 'dashed':
- if width_name in ('hair', 'thin'):
- return 'dashed'
- return 'mediumDashed'
+ if style == "dotted":
+ if width_name in ("hair", "thin"):
+ return "dotted"
+ return "mediumDashDotDot"
+ if style == "dashed":
+ if width_name in ("hair", "thin"):
+ return "dashed"
+ return "mediumDashed"
def build_fill(self, props):
# TODO: perhaps allow for special properties
# -excel-pattern-bgcolor and -excel-pattern-type
- fill_color = props.get('background-color')
- if fill_color not in (None, 'transparent', 'none'):
- return {
- 'fgColor': self.color_to_excel(fill_color),
- 'patternType': 'solid',
- }
-
- BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True,
- '800': True, '900': True,
- 'normal': False, 'lighter': False, '100': False, '200': False,
- '300': False, '400': False, '500': False}
- ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True}
+ fill_color = props.get("background-color")
+ if fill_color not in (None, "transparent", "none"):
+ return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}
+
+ BOLD_MAP = {
+ "bold": True,
+ "bolder": True,
+ "600": True,
+ "700": True,
+ "800": True,
+ "900": True,
+ "normal": False,
+ "lighter": False,
+ "100": False,
+ "200": False,
+ "300": False,
+ "400": False,
+ "500": False,
+ }
+ ITALIC_MAP = {"normal": False, "italic": True, "oblique": True}
def build_font(self, props):
- size = props.get('font-size')
+ size = props.get("font-size")
if size is not None:
- assert size.endswith('pt')
+ assert size.endswith("pt")
size = float(size[:-2])
- font_names_tmp = re.findall(r'''(?x)
+ font_names_tmp = re.findall(
+ r"""(?x)
(
"(?:[^"]|\\")+"
|
@@ -213,13 +229,15 @@ def build_font(self, props):
|
[^'",]+
)(?=,|\s*$)
- ''', props.get('font-family', ''))
+ """,
+ props.get("font-family", ""),
+ )
font_names = []
for name in font_names_tmp:
if name[:1] == '"':
name = name[1:-1].replace('\\"', '"')
- elif name[:1] == '\'':
- name = name[1:-1].replace('\\\'', '\'')
+ elif name[:1] == "'":
+ name = name[1:-1].replace("\\'", "'")
else:
name = name.strip()
if name:
@@ -227,40 +245,40 @@ def build_font(self, props):
family = None
for name in font_names:
- if name == 'serif':
+ if name == "serif":
family = 1 # roman
break
- elif name == 'sans-serif':
+ elif name == "sans-serif":
family = 2 # swiss
break
- elif name == 'cursive':
+ elif name == "cursive":
family = 4 # script
break
- elif name == 'fantasy':
+ elif name == "fantasy":
family = 5 # decorative
break
- decoration = props.get('text-decoration')
+ decoration = props.get("text-decoration")
if decoration is not None:
decoration = decoration.split()
else:
decoration = ()
return {
- 'name': font_names[0] if font_names else None,
- 'family': family,
- 'size': size,
- 'bold': self.BOLD_MAP.get(props.get('font-weight')),
- 'italic': self.ITALIC_MAP.get(props.get('font-style')),
- 'underline': ('single' if
- 'underline' in decoration
- else None),
- 'strike': ('line-through' in decoration) or None,
- 'color': self.color_to_excel(props.get('color')),
+ "name": font_names[0] if font_names else None,
+ "family": family,
+ "size": size,
+ "bold": self.BOLD_MAP.get(props.get("font-weight")),
+ "italic": self.ITALIC_MAP.get(props.get("font-style")),
+ "underline": ("single" if "underline" in decoration else None),
+ "strike": ("line-through" in decoration) or None,
+ "color": self.color_to_excel(props.get("color")),
# shadow if nonzero digit before shadow color
- 'shadow': (bool(re.search('^[^#(]*[1-9]',
- props['text-shadow']))
- if 'text-shadow' in props else None),
+ "shadow": (
+ bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
+ if "text-shadow" in props
+ else None
+ ),
# 'vertAlign':,
# 'charset': ,
# 'scheme': ,
@@ -269,43 +287,42 @@ def build_font(self, props):
}
NAMED_COLORS = {
- 'maroon': '800000',
- 'brown': 'A52A2A',
- 'red': 'FF0000',
- 'pink': 'FFC0CB',
- 'orange': 'FFA500',
- 'yellow': 'FFFF00',
- 'olive': '808000',
- 'green': '008000',
- 'purple': '800080',
- 'fuchsia': 'FF00FF',
- 'lime': '00FF00',
- 'teal': '008080',
- 'aqua': '00FFFF',
- 'blue': '0000FF',
- 'navy': '000080',
- 'black': '000000',
- 'gray': '808080',
- 'grey': '808080',
- 'silver': 'C0C0C0',
- 'white': 'FFFFFF',
+ "maroon": "800000",
+ "brown": "A52A2A",
+ "red": "FF0000",
+ "pink": "FFC0CB",
+ "orange": "FFA500",
+ "yellow": "FFFF00",
+ "olive": "808000",
+ "green": "008000",
+ "purple": "800080",
+ "fuchsia": "FF00FF",
+ "lime": "00FF00",
+ "teal": "008080",
+ "aqua": "00FFFF",
+ "blue": "0000FF",
+ "navy": "000080",
+ "black": "000000",
+ "gray": "808080",
+ "grey": "808080",
+ "silver": "C0C0C0",
+ "white": "FFFFFF",
}
def color_to_excel(self, val):
if val is None:
return None
- if val.startswith('#') and len(val) == 7:
+ if val.startswith("#") and len(val) == 7:
return val[1:].upper()
- if val.startswith('#') and len(val) == 4:
+ if val.startswith("#") and len(val) == 4:
return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
try:
return self.NAMED_COLORS[val]
except KeyError:
- warnings.warn('Unhandled color format: {val!r}'.format(val=val),
- CSSWarning)
+ warnings.warn("Unhandled color format: {val!r}".format(val=val), CSSWarning)
def build_number_format(self, props):
- return {'format_code': props.get('number-format')}
+ return {"format_code": props.get("number-format")}
class ExcelFormatter:
@@ -341,15 +358,25 @@ class ExcelFormatter:
This is only called for body cells.
"""
- max_rows = 2**20
- max_cols = 2**14
-
- def __init__(self, df, na_rep='', float_format=None, cols=None,
- header=True, index=True, index_label=None, merge_cells=False,
- inf_rep='inf', style_converter=None):
+ max_rows = 2 ** 20
+ max_cols = 2 ** 14
+
+ def __init__(
+ self,
+ df,
+ na_rep="",
+ float_format=None,
+ cols=None,
+ header=True,
+ index=True,
+ index_label=None,
+ merge_cells=False,
+ inf_rep="inf",
+ style_converter=None,
+ ):
self.rowcounter = 0
self.na_rep = na_rep
- if hasattr(df, 'render'):
+ if hasattr(df, "render"):
self.styler = df
df = df.data
if style_converter is None:
@@ -362,8 +389,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
# all missing, raise
if not len(Index(cols) & df.columns):
- raise KeyError(
- "passes columns are not ALL present dataframe")
+ raise KeyError("passes columns are not ALL present dataframe")
# deprecatedin gh-17295
# 1 missing is ok (for now)
@@ -371,7 +397,8 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
warnings.warn(
"Not all names specified in 'columns' are found; "
"this will raise a KeyError in the future",
- FutureWarning)
+ FutureWarning,
+ )
self.df = df.reindex(columns=cols)
self.columns = self.df.columns
@@ -384,13 +411,16 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
@property
def header_style(self):
- return {"font": {"bold": True},
- "borders": {"top": "thin",
- "right": "thin",
- "bottom": "thin",
- "left": "thin"},
- "alignment": {"horizontal": "center",
- "vertical": "top"}}
+ return {
+ "font": {"bold": True},
+ "borders": {
+ "top": "thin",
+ "right": "thin",
+ "bottom": "thin",
+ "left": "thin",
+ },
+ "alignment": {"horizontal": "center", "vertical": "top"},
+ }
def _format_value(self, val):
if is_scalar(val) and missing.isna(val):
@@ -399,30 +429,35 @@ def _format_value(self, val):
if missing.isposinf_scalar(val):
val = self.inf_rep
elif missing.isneginf_scalar(val):
- val = '-{inf}'.format(inf=self.inf_rep)
+ val = "-{inf}".format(inf=self.inf_rep)
elif self.float_format is not None:
val = float(self.float_format % val)
- if getattr(val, 'tzinfo', None) is not None:
- raise ValueError('Excel does not support datetimes with '
- 'timezones. Please ensure that datetimes '
- 'are timezone unaware before writing to Excel.')
+ if getattr(val, "tzinfo", None) is not None:
+ raise ValueError(
+ "Excel does not support datetimes with "
+ "timezones. Please ensure that datetimes "
+ "are timezone unaware before writing to Excel."
+ )
return val
def _format_header_mi(self):
if self.columns.nlevels > 1:
if not self.index:
- raise NotImplementedError("Writing to Excel with MultiIndex"
- " columns and no index "
- "('index'=False) is not yet "
- "implemented.")
+ raise NotImplementedError(
+ "Writing to Excel with MultiIndex"
+ " columns and no index "
+ "('index'=False) is not yet "
+ "implemented."
+ )
has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
if not (has_aliases or self.header):
return
columns = self.columns
- level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
- names=False)
+ level_strs = columns.format(
+ sparsify=self.merge_cells, adjoin=False, names=False
+ )
level_lengths = get_level_lengths(level_strs)
coloffset = 0
lnum = 0
@@ -436,17 +471,24 @@ def _format_header_mi(self):
name = columns.names[lnum]
yield ExcelCell(lnum, coloffset, name, self.header_style)
- for lnum, (spans, levels, level_codes) in enumerate(zip(
- level_lengths, columns.levels, columns.codes)):
+ for lnum, (spans, levels, level_codes) in enumerate(
+ zip(level_lengths, columns.levels, columns.codes)
+ ):
values = levels.take(level_codes)
for i in spans:
if spans[i] > 1:
- yield ExcelCell(lnum, coloffset + i + 1, values[i],
- self.header_style, lnum,
- coloffset + i + spans[i])
+ yield ExcelCell(
+ lnum,
+ coloffset + i + 1,
+ values[i],
+ self.header_style,
+ lnum,
+ coloffset + i + spans[i],
+ )
else:
- yield ExcelCell(lnum, coloffset + i + 1, values[i],
- self.header_style)
+ yield ExcelCell(
+ lnum, coloffset + i + 1, values[i], self.header_style
+ )
else:
# Format in legacy format with dots to indicate levels.
for i, values in enumerate(zip(*level_strs)):
@@ -468,15 +510,17 @@ def _format_header_regular(self):
colnames = self.columns
if has_aliases:
if len(self.header) != len(self.columns):
- raise ValueError('Writing {cols} cols but got {alias} '
- 'aliases'.format(cols=len(self.columns),
- alias=len(self.header)))
+ raise ValueError(
+ "Writing {cols} cols but got {alias} "
+ "aliases".format(cols=len(self.columns), alias=len(self.header))
+ )
else:
colnames = self.header
for colindex, colname in enumerate(colnames):
- yield ExcelCell(self.rowcounter, colindex + coloffset, colname,
- self.header_style)
+ yield ExcelCell(
+ self.rowcounter, colindex + coloffset, colname, self.header_style
+ )
def _format_header(self):
if isinstance(self.columns, ABCMultiIndex):
@@ -486,12 +530,14 @@ def _format_header(self):
gen2 = ()
if self.df.index.names:
- row = [x if x is not None else ''
- for x in self.df.index.names] + [''] * len(self.columns)
- if reduce(lambda x, y: x and y, map(lambda x: x != '', row)):
- gen2 = (ExcelCell(self.rowcounter, colindex, val,
- self.header_style)
- for colindex, val in enumerate(row))
+ row = [x if x is not None else "" for x in self.df.index.names] + [
+ ""
+ ] * len(self.columns)
+ if reduce(lambda x, y: x and y, map(lambda x: x != "", row)):
+ gen2 = (
+ ExcelCell(self.rowcounter, colindex, val, self.header_style)
+ for colindex, val in enumerate(row)
+ )
self.rowcounter += 1
return itertools.chain(gen, gen2)
@@ -511,9 +557,9 @@ def _format_regular_rows(self):
if self.index:
# check aliases
# if list only take first as this is not a MultiIndex
- if (self.index_label and
- isinstance(self.index_label, (list, tuple, np.ndarray,
- Index))):
+ if self.index_label and isinstance(
+ self.index_label, (list, tuple, np.ndarray, Index)
+ ):
index_label = self.index_label[0]
# if string good to go
elif self.index_label and isinstance(self.index_label, str):
@@ -525,8 +571,7 @@ def _format_regular_rows(self):
self.rowcounter += 1
if index_label and self.header is not False:
- yield ExcelCell(self.rowcounter - 1, 0, index_label,
- self.header_style)
+ yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)
# write index_values
index_values = self.df.index
@@ -534,8 +579,7 @@ def _format_regular_rows(self):
index_values = self.df.index.to_timestamp()
for idx, idxval in enumerate(index_values):
- yield ExcelCell(self.rowcounter + idx, 0, idxval,
- self.header_style)
+ yield ExcelCell(self.rowcounter + idx, 0, idxval, self.header_style)
coloffset = 1
else:
@@ -554,9 +598,9 @@ def _format_hierarchical_rows(self):
if self.index:
index_labels = self.df.index.names
# check for aliases
- if (self.index_label and
- isinstance(self.index_label, (list, tuple, np.ndarray,
- Index))):
+ if self.index_label and isinstance(
+ self.index_label, (list, tuple, np.ndarray, Index)
+ ):
index_labels = self.index_label
# MultiIndex columns require an extra row
@@ -570,40 +614,52 @@ def _format_hierarchical_rows(self):
if com._any_not_none(*index_labels) and self.header is not False:
for cidx, name in enumerate(index_labels):
- yield ExcelCell(self.rowcounter - 1, cidx, name,
- self.header_style)
+ yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)
if self.merge_cells:
# Format hierarchical rows as merged cells.
- level_strs = self.df.index.format(sparsify=True, adjoin=False,
- names=False)
+ level_strs = self.df.index.format(
+ sparsify=True, adjoin=False, names=False
+ )
level_lengths = get_level_lengths(level_strs)
- for spans, levels, level_codes in zip(level_lengths,
- self.df.index.levels,
- self.df.index.codes):
+ for spans, levels, level_codes in zip(
+ level_lengths, self.df.index.levels, self.df.index.codes
+ ):
- values = levels.take(level_codes,
- allow_fill=levels._can_hold_na,
- fill_value=True)
+ values = levels.take(
+ level_codes, allow_fill=levels._can_hold_na, fill_value=True
+ )
for i in spans:
if spans[i] > 1:
- yield ExcelCell(self.rowcounter + i, gcolidx,
- values[i], self.header_style,
- self.rowcounter + i + spans[i] - 1,
- gcolidx)
+ yield ExcelCell(
+ self.rowcounter + i,
+ gcolidx,
+ values[i],
+ self.header_style,
+ self.rowcounter + i + spans[i] - 1,
+ gcolidx,
+ )
else:
- yield ExcelCell(self.rowcounter + i, gcolidx,
- values[i], self.header_style)
+ yield ExcelCell(
+ self.rowcounter + i,
+ gcolidx,
+ values[i],
+ self.header_style,
+ )
gcolidx += 1
else:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
- yield ExcelCell(self.rowcounter + idx, gcolidx,
- indexcolval, self.header_style)
+ yield ExcelCell(
+ self.rowcounter + idx,
+ gcolidx,
+ indexcolval,
+ self.header_style,
+ )
gcolidx += 1
for cell in self._generate_body(gcolidx):
@@ -623,18 +679,23 @@ def _generate_body(self, coloffset):
series = self.df.iloc[:, colidx]
for i, val in enumerate(series):
if styles is not None:
- xlstyle = self.style_converter(';'.join(styles[i, colidx]))
- yield ExcelCell(self.rowcounter + i, colidx + coloffset, val,
- xlstyle)
+ xlstyle = self.style_converter(";".join(styles[i, colidx]))
+ yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle)
def get_formatted_cells(self):
- for cell in itertools.chain(self._format_header(),
- self._format_body()):
+ for cell in itertools.chain(self._format_header(), self._format_body()):
cell.val = self._format_value(cell.val)
yield cell
- def write(self, writer, sheet_name='Sheet1', startrow=0,
- startcol=0, freeze_panes=None, engine=None):
+ def write(
+ self,
+ writer,
+ sheet_name="Sheet1",
+ startrow=0,
+ startcol=0,
+ freeze_panes=None,
+ engine=None,
+ ):
"""
writer : string or ExcelWriter object
File path or existing ExcelWriter
@@ -657,10 +718,11 @@ def write(self, writer, sheet_name='Sheet1', startrow=0,
num_rows, num_cols = self.df.shape
if num_rows > self.max_rows or num_cols > self.max_cols:
- raise ValueError("This sheet is too large! Your sheet size is: " +
- "{}, {} ".format(num_rows, num_cols) +
- "Max sheet size is: {}, {}".
- format(self.max_rows, self.max_cols))
+ raise ValueError(
+ "This sheet is too large! Your sheet size is: "
+ + "{}, {} ".format(num_rows, num_cols)
+ + "Max sheet size is: {}, {}".format(self.max_rows, self.max_cols)
+ )
if isinstance(writer, ExcelWriter):
need_save = False
@@ -669,8 +731,12 @@ def write(self, writer, sheet_name='Sheet1', startrow=0,
need_save = True
formatted_cells = self.get_formatted_cells()
- writer.write_cells(formatted_cells, sheet_name,
- startrow=startrow, startcol=startcol,
- freeze_panes=freeze_panes)
+ writer.write_cells(
+ formatted_cells,
+ sheet_name,
+ startrow=startrow,
+ startcol=startcol,
+ freeze_panes=freeze_panes,
+ )
if need_save:
writer.save()
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index 98c31fbeb78e6..c4e3dd1c755cf 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -17,12 +17,26 @@
from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_complex_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_extension_array_dtype, is_float, is_float_dtype,
- is_integer, is_integer_dtype, is_list_like, is_numeric_dtype, is_scalar,
- is_timedelta64_dtype)
+ is_categorical_dtype,
+ is_complex_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_extension_array_dtype,
+ is_float,
+ is_float_dtype,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ is_scalar,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.generic import (
- ABCIndexClass, ABCMultiIndex, ABCSeries, ABCSparseArray)
+ ABCIndexClass,
+ ABCMultiIndex,
+ ABCSeries,
+ ABCSparseArray,
+)
from pandas.core.dtypes.missing import isna, notna
from pandas.core.base import PandasObject
@@ -92,9 +106,19 @@
.. versionadded:: 0.18.0
"""
-_VALID_JUSTIFY_PARAMETERS = ("left", "right", "center", "justify",
- "justify-all", "start", "end", "inherit",
- "match-parent", "initial", "unset")
+_VALID_JUSTIFY_PARAMETERS = (
+ "left",
+ "right",
+ "center",
+ "justify",
+ "justify-all",
+ "start",
+ "end",
+ "inherit",
+ "match-parent",
+ "initial",
+ "unset",
+)
return_docstring = """
Returns
@@ -105,9 +129,7 @@
class CategoricalFormatter:
-
- def __init__(self, categorical, buf=None, length=True, na_rep='NaN',
- footer=True):
+ def __init__(self, categorical, buf=None, length=True, na_rep="NaN", footer=True):
self.categorical = categorical
self.buf = buf if buf is not None else StringIO("")
self.na_rep = na_rep
@@ -115,25 +137,29 @@ def __init__(self, categorical, buf=None, length=True, na_rep='NaN',
self.footer = footer
def _get_footer(self):
- footer = ''
+ footer = ""
if self.length:
if footer:
- footer += ', '
+ footer += ", "
footer += "Length: {length}".format(length=len(self.categorical))
level_info = self.categorical._repr_categories_info()
# Levels are added in a newline
if footer:
- footer += '\n'
+ footer += "\n"
footer += level_info
return str(footer)
def _get_formatted_values(self):
- return format_array(self.categorical._internal_get_values(), None,
- float_format=None, na_rep=self.na_rep)
+ return format_array(
+ self.categorical._internal_get_values(),
+ None,
+ float_format=None,
+ na_rep=self.na_rep,
+ )
def to_string(self):
categorical = self.categorical
@@ -142,27 +168,37 @@ def to_string(self):
if self.footer:
return self._get_footer()
else:
- return ''
+ return ""
fmt_values = self._get_formatted_values()
- result = ['{i}'.format(i=i) for i in fmt_values]
+ result = ["{i}".format(i=i) for i in fmt_values]
result = [i.strip() for i in result]
- result = ', '.join(result)
- result = ['[' + result + ']']
+ result = ", ".join(result)
+ result = ["[" + result + "]"]
if self.footer:
footer = self._get_footer()
if footer:
result.append(footer)
- return str('\n'.join(result))
+ return str("\n".join(result))
class SeriesFormatter:
-
- def __init__(self, series, buf=None, length=True, header=True, index=True,
- na_rep='NaN', name=False, float_format=None, dtype=True,
- max_rows=None, min_rows=None):
+ def __init__(
+ self,
+ series,
+ buf=None,
+ length=True,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ name=False,
+ float_format=None,
+ dtype=True,
+ max_rows=None,
+ min_rows=None,
+ ):
self.series = series
self.buf = buf if buf is not None else StringIO()
self.name = name
@@ -183,6 +219,7 @@ def __init__(self, series, buf=None, length=True, header=True, index=True,
def _chk_truncate(self):
from pandas.core.reshape.concat import concat
+
min_rows = self.min_rows
max_rows = self.max_rows
# truncation determined by max_rows, actual truncated number of rows
@@ -199,8 +236,7 @@ def _chk_truncate(self):
series = series.iloc[:max_rows]
else:
row_num = max_rows // 2
- series = concat((series.iloc[:row_num],
- series.iloc[-row_num:]))
+ series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
self.tr_row_num = row_num
else:
self.tr_row_num = None
@@ -209,32 +245,31 @@ def _chk_truncate(self):
def _get_footer(self):
name = self.series.name
- footer = ''
+ footer = ""
- if getattr(self.series.index, 'freq', None) is not None:
- footer += 'Freq: {freq}'.format(freq=self.series.index.freqstr)
+ if getattr(self.series.index, "freq", None) is not None:
+ footer += "Freq: {freq}".format(freq=self.series.index.freqstr)
if self.name is not False and name is not None:
if footer:
- footer += ', '
+ footer += ", "
- series_name = pprint_thing(name,
- escape_chars=('\t', '\r', '\n'))
- footer += (("Name: {sname}".format(sname=series_name))
- if name is not None else "")
+ series_name = pprint_thing(name, escape_chars=("\t", "\r", "\n"))
+ footer += (
+ ("Name: {sname}".format(sname=series_name)) if name is not None else ""
+ )
- if (self.length is True or
- (self.length == 'truncate' and self.truncate_v)):
+ if self.length is True or (self.length == "truncate" and self.truncate_v):
if footer:
- footer += ', '
- footer += 'Length: {length}'.format(length=len(self.series))
+ footer += ", "
+ footer += "Length: {length}".format(length=len(self.series))
if self.dtype is not False and self.dtype is not None:
- name = getattr(self.tr_series.dtype, 'name', None)
+ name = getattr(self.tr_series.dtype, "name", None)
if name:
if footer:
- footer += ', '
- footer += 'dtype: {typ}'.format(typ=pprint_thing(name))
+ footer += ", "
+ footer += "dtype: {typ}".format(typ=pprint_thing(name))
# level infos are added to the end and in a new line, like it is done
# for Categoricals
@@ -260,8 +295,9 @@ def _get_formatted_index(self):
def _get_formatted_values(self):
values_to_format = self.tr_series._formatting_values()
- return format_array(values_to_format, None,
- float_format=self.float_format, na_rep=self.na_rep)
+ return format_array(
+ values_to_format, None, float_format=self.float_format, na_rep=self.na_rep
+ )
def to_string(self):
series = self.tr_series
@@ -269,7 +305,8 @@ def to_string(self):
if len(series) == 0:
return "{name}([], {footer})".format(
- name=self.series.__class__.__name__, footer=footer)
+ name=self.series.__class__.__name__, footer=footer
+ )
fmt_index, have_header = self._get_formatted_index()
fmt_values = self._get_formatted_values()
@@ -279,14 +316,14 @@ def to_string(self):
row_num = self.tr_row_num
width = self.adj.len(fmt_values[row_num - 1])
if width > 3:
- dot_str = '...'
+ dot_str = "..."
else:
- dot_str = '..'
+ dot_str = ".."
# Series uses mode=center because it has single value columns
# DataFrame uses mode=left
- dot_str = self.adj.justify([dot_str], width, mode='center')[0]
+ dot_str = self.adj.justify([dot_str], width, mode="center")[0]
fmt_values.insert(row_num + n_header_rows, dot_str)
- fmt_index.insert(row_num + 1, '')
+ fmt_index.insert(row_num + 1, "")
if self.index:
result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
@@ -294,32 +331,29 @@ def to_string(self):
result = self.adj.adjoin(3, fmt_values)
if self.header and have_header:
- result = fmt_index[0] + '\n' + result
+ result = fmt_index[0] + "\n" + result
if footer:
- result += '\n' + footer
+ result += "\n" + footer
- return str(''.join(result))
+ return str("".join(result))
class TextAdjustment:
-
def __init__(self):
self.encoding = get_option("display.encoding")
def len(self, text):
return len(text)
- def justify(self, texts, max_len, mode='right'):
+ def justify(self, texts, max_len, mode="right"):
return justify(texts, max_len, mode=mode)
def adjoin(self, space, *lists, **kwargs):
- return adjoin(space, *lists, strlen=self.len,
- justfunc=self.justify, **kwargs)
+ return adjoin(space, *lists, strlen=self.len, justfunc=self.justify, **kwargs)
class EastAsianTextAdjustment(TextAdjustment):
-
def __init__(self):
super().__init__()
if get_option("display.unicode.ambiguous_as_wide"):
@@ -330,7 +364,7 @@ def __init__(self):
# Definition of East Asian Width
# http://unicode.org/reports/tr11/
# Ambiguous width can be changed by option
- self._EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1}
+ self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
def len(self, text):
"""
@@ -339,17 +373,18 @@ def len(self, text):
if not isinstance(text, str):
return len(text)
- return sum(self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width)
- for c in text)
+ return sum(
+ self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
+ )
- def justify(self, texts, max_len, mode='right'):
+ def justify(self, texts, max_len, mode="right"):
# re-calculate padding space per str considering East Asian Width
def _get_pad(t):
return max_len - self.len(t) + len(t)
- if mode == 'left':
+ if mode == "left":
return [x.ljust(_get_pad(x)) for x in texts]
- elif mode == 'center':
+ elif mode == "center":
return [x.center(_get_pad(x)) for x in texts]
else:
return [x.rjust(_get_pad(x)) for x in texts]
@@ -370,8 +405,9 @@ class TableFormatter:
@property
def should_show_dimensions(self):
- return (self.show_dimensions is True or
- (self.show_dimensions == 'truncate' and self.is_truncated))
+ return self.show_dimensions is True or (
+ self.show_dimensions == "truncate" and self.is_truncated
+ )
def _get_formatter(self, i):
if isinstance(self.formatters, (list, tuple)):
@@ -395,15 +431,33 @@ class DataFrameFormatter(TableFormatter):
"""
- __doc__ = __doc__ if __doc__ else ''
+ __doc__ = __doc__ if __doc__ else ""
__doc__ += common_docstring + return_docstring
- def __init__(self, frame, buf=None, columns=None, col_space=None,
- header=True, index=True, na_rep='NaN', formatters=None,
- justify=None, float_format=None, sparsify=None,
- index_names=True, line_width=None, max_rows=None,
- min_rows=None, max_cols=None, show_dimensions=False,
- decimal='.', table_id=None, render_links=False, **kwds):
+ def __init__(
+ self,
+ frame,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ justify=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ line_width=None,
+ max_rows=None,
+ min_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ decimal=".",
+ table_id=None,
+ render_links=False,
+ **kwds
+ ):
self.frame = frame
if buf is not None:
self.buf = _expand_user(_stringify_path(buf))
@@ -427,8 +481,7 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
self.max_rows = max_rows
self.min_rows = min_rows
self.max_cols = max_cols
- self.max_rows_displayed = min(max_rows or len(self.frame),
- len(self.frame))
+ self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame))
self.show_dimensions = show_dimensions
self.table_id = table_id
self.render_links = render_links
@@ -469,8 +522,7 @@ def _chk_truncate(self):
prompt_row = 1
if self.show_dimensions:
show_dimension_rows = 3
- n_add_rows = (self.header + dot_row + show_dimension_rows +
- prompt_row)
+ n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row
# rows available to fill with actual data
max_rows_adj = self.h - n_add_rows
self.max_rows_adj = max_rows_adj
@@ -482,13 +534,13 @@ def _chk_truncate(self):
if max_rows == 0 and len(self.frame) > h:
max_rows = h
- if not hasattr(self, 'max_rows_adj'):
+ if not hasattr(self, "max_rows_adj"):
if max_rows:
if (len(self.frame) > max_rows) and self.min_rows:
# if truncated, set max_rows showed to min_rows
max_rows = min(self.min_rows, max_rows)
self.max_rows_adj = max_rows
- if not hasattr(self, 'max_cols_adj'):
+ if not hasattr(self, "max_cols_adj"):
self.max_cols_adj = max_cols
max_cols_adj = self.max_cols_adj
@@ -505,9 +557,10 @@ def _chk_truncate(self):
frame = frame.iloc[:, :max_cols]
col_num = max_cols
else:
- col_num = (max_cols_adj // 2)
- frame = concat((frame.iloc[:, :col_num],
- frame.iloc[:, -col_num:]), axis=1)
+ col_num = max_cols_adj // 2
+ frame = concat(
+ (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1
+ )
self.tr_col_num = col_num
if truncate_v:
if max_rows_adj == 1:
@@ -515,8 +568,7 @@ def _chk_truncate(self):
frame = frame.iloc[:max_rows, :]
else:
row_num = max_rows_adj // 2
- frame = concat((frame.iloc[:row_num, :],
- frame.iloc[-row_num:, :]))
+ frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :]))
self.tr_row_num = row_num
else:
self.tr_row_num = None
@@ -539,37 +591,44 @@ def _to_str_columns(self):
stringified = []
for i, c in enumerate(frame):
fmt_values = self._format_col(i)
- fmt_values = _make_fixed_width(fmt_values, self.justify,
- minimum=(self.col_space or 0),
- adj=self.adj)
+ fmt_values = _make_fixed_width(
+ fmt_values,
+ self.justify,
+ minimum=(self.col_space or 0),
+ adj=self.adj,
+ )
stringified.append(fmt_values)
else:
if is_list_like(self.header):
if len(self.header) != len(self.columns):
- raise ValueError(('Writing {ncols} cols but got {nalias} '
- 'aliases'
- .format(ncols=len(self.columns),
- nalias=len(self.header))))
+ raise ValueError(
+ (
+ "Writing {ncols} cols but got {nalias} "
+ "aliases".format(
+ ncols=len(self.columns), nalias=len(self.header)
+ )
+ )
+ )
str_columns = [[label] for label in self.header]
else:
str_columns = self._get_formatted_column_labels(frame)
if self.show_row_idx_names:
for x in str_columns:
- x.append('')
+ x.append("")
stringified = []
for i, c in enumerate(frame):
cheader = str_columns[i]
- header_colwidth = max(self.col_space or 0,
- *(self.adj.len(x) for x in cheader))
+ header_colwidth = max(
+ self.col_space or 0, *(self.adj.len(x) for x in cheader)
+ )
fmt_values = self._format_col(i)
- fmt_values = _make_fixed_width(fmt_values, self.justify,
- minimum=header_colwidth,
- adj=self.adj)
+ fmt_values = _make_fixed_width(
+ fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
+ )
- max_len = max(max(self.adj.len(x) for x in fmt_values),
- header_colwidth)
+ max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
cheader = self.adj.justify(cheader, max_len, mode=self.justify)
stringified.append(cheader + fmt_values)
@@ -583,7 +642,7 @@ def _to_str_columns(self):
if truncate_h:
col_num = self.tr_col_num
- strcols.insert(self.tr_col_num + 1, [' ...'] * (len(str_index)))
+ strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index)))
if truncate_v:
n_header_rows = len(str_index) - len(frame)
row_num = self.tr_row_num
@@ -594,17 +653,17 @@ def _to_str_columns(self):
if truncate_h:
is_dot_col = ix == col_num + 1
if cwidth > 3 or is_dot_col:
- my_str = '...'
+ my_str = "..."
else:
- my_str = '..'
+ my_str = ".."
if ix == 0:
- dot_mode = 'left'
+ dot_mode = "left"
elif is_dot_col:
cwidth = 4
- dot_mode = 'right'
+ dot_mode = "right"
else:
- dot_mode = 'right'
+ dot_mode = "right"
dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0]
strcols[ix].insert(row_num + n_header_rows, dot_str)
return strcols
@@ -618,10 +677,11 @@ def to_string(self):
frame = self.frame
if len(frame.columns) == 0 or len(frame.index) == 0:
- info_line = ('Empty {name}\nColumns: {col}\nIndex: {idx}'
- .format(name=type(self.frame).__name__,
- col=pprint_thing(frame.columns),
- idx=pprint_thing(frame.index)))
+ info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format(
+ name=type(self.frame).__name__,
+ col=pprint_thing(frame.columns),
+ idx=pprint_thing(frame.index),
+ )
text = info_line
else:
@@ -629,27 +689,27 @@ def to_string(self):
if self.line_width is None: # no need to wrap around just print
# the whole frame
text = self.adj.adjoin(1, *strcols)
- elif (not isinstance(self.max_cols, int) or
- self.max_cols > 0): # need to wrap around
+ elif (
+ not isinstance(self.max_cols, int) or self.max_cols > 0
+ ): # need to wrap around
text = self._join_multiline(*strcols)
else: # max_cols == 0. Try to fit frame to terminal
- text = self.adj.adjoin(1, *strcols).split('\n')
+ text = self.adj.adjoin(1, *strcols).split("\n")
max_len = Series(text).str.len().max()
# plus truncate dot col
dif = max_len - self.w
# '+ 1' to avoid too wide repr (GH PR #17023)
adj_dif = dif + 1
- col_lens = Series([Series(ele).apply(len).max()
- for ele in strcols])
+ col_lens = Series([Series(ele).apply(len).max() for ele in strcols])
n_cols = len(col_lens)
counter = 0
while adj_dif > 0 and n_cols > 1:
counter += 1
- mid = int(round(n_cols / 2.))
+ mid = int(round(n_cols / 2.0))
mid_ix = col_lens.index[mid]
col_len = col_lens[mid_ix]
# adjoin adds one
- adj_dif -= (col_len + 1)
+ adj_dif -= col_len + 1
col_lens = col_lens.drop(mid_ix)
n_cols = len(col_lens)
# subtract index column
@@ -666,8 +726,11 @@ def to_string(self):
self.buf.writelines(text)
if self.should_show_dimensions:
- self.buf.write("\n\n[{nrows} rows x {ncols} columns]"
- .format(nrows=len(frame), ncols=len(frame.columns)))
+ self.buf.write(
+ "\n\n[{nrows} rows x {ncols} columns]".format(
+ nrows=len(frame), ncols=len(frame.columns)
+ )
+ )
def _join_multiline(self, *strcols):
lwidth = self.line_width
@@ -675,11 +738,12 @@ def _join_multiline(self, *strcols):
strcols = list(strcols)
if self.index:
idx = strcols.pop(0)
- lwidth -= np.array([self.adj.len(x)
- for x in idx]).max() + adjoin_width
+ lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
- col_widths = [np.array([self.adj.len(x) for x in col]).max() if
- len(col) > 0 else 0 for col in strcols]
+ col_widths = [
+ np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
+ for col in strcols
+ ]
col_bins = _binify(col_widths, lwidth)
nbins = len(col_bins)
@@ -696,46 +760,62 @@ def _join_multiline(self, *strcols):
row.insert(0, idx)
if nbins > 1:
if ed <= len(strcols) and i < nbins - 1:
- row.append([' \\'] + [' '] * (nrows - 1))
+ row.append([" \\"] + [" "] * (nrows - 1))
else:
- row.append([' '] * nrows)
+ row.append([" "] * nrows)
str_lst.append(self.adj.adjoin(adjoin_width, *row))
st = ed
- return '\n\n'.join(str_lst)
-
- def to_latex(self, column_format=None, longtable=False, encoding=None,
- multicolumn=False, multicolumn_format=None, multirow=False):
+ return "\n\n".join(str_lst)
+
+ def to_latex(
+ self,
+ column_format=None,
+ longtable=False,
+ encoding=None,
+ multicolumn=False,
+ multicolumn_format=None,
+ multirow=False,
+ ):
"""
Render a DataFrame to a LaTeX tabular/longtable environment output.
"""
from pandas.io.formats.latex import LatexFormatter
- latex_renderer = LatexFormatter(self, column_format=column_format,
- longtable=longtable,
- multicolumn=multicolumn,
- multicolumn_format=multicolumn_format,
- multirow=multirow)
+
+ latex_renderer = LatexFormatter(
+ self,
+ column_format=column_format,
+ longtable=longtable,
+ multicolumn=multicolumn,
+ multicolumn_format=multicolumn_format,
+ multirow=multirow,
+ )
if encoding is None:
- encoding = 'utf-8'
+ encoding = "utf-8"
- if hasattr(self.buf, 'write'):
+ if hasattr(self.buf, "write"):
latex_renderer.write_result(self.buf)
elif isinstance(self.buf, str):
import codecs
- with codecs.open(self.buf, 'w', encoding=encoding) as f:
+
+ with codecs.open(self.buf, "w", encoding=encoding) as f:
latex_renderer.write_result(f)
else:
- raise TypeError('buf is not a file name and it has no write '
- 'method')
+ raise TypeError("buf is not a file name and it has no write " "method")
def _format_col(self, i):
frame = self.tr_frame
formatter = self._get_formatter(i)
values_to_format = frame.iloc[:, i]._formatting_values()
- return format_array(values_to_format, formatter,
- float_format=self.float_format, na_rep=self.na_rep,
- space=self.col_space, decimal=self.decimal)
+ return format_array(
+ values_to_format,
+ formatter,
+ float_format=self.float_format,
+ na_rep=self.na_rep,
+ space=self.col_space,
+ decimal=self.decimal,
+ )
def to_html(self, classes=None, notebook=False, border=None):
"""
@@ -755,16 +835,16 @@ def to_html(self, classes=None, notebook=False, border=None):
.. versionadded:: 0.19.0
"""
from pandas.io.formats.html import HTMLFormatter, NotebookFormatter
+
Klass = NotebookFormatter if notebook else HTMLFormatter
html = Klass(self, classes=classes, border=border).render()
- if hasattr(self.buf, 'write'):
+ if hasattr(self.buf, "write"):
buffer_put_lines(self.buf, html)
elif isinstance(self.buf, str):
- with open(self.buf, 'w') as f:
+ with open(self.buf, "w") as f:
buffer_put_lines(f, html)
else:
- raise TypeError('buf is not a file name and it has no write '
- ' method')
+ raise TypeError("buf is not a file name and it has no write " " method")
def _get_formatted_column_labels(self, frame):
from pandas.core.index import _sparsify
@@ -781,13 +861,17 @@ def _get_formatted_column_labels(self, frame):
need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
def space_format(x, y):
- if (y not in self.formatters and
- need_leadsp[x] and not restrict_formatting):
- return ' ' + y
+ if (
+ y not in self.formatters
+ and need_leadsp[x]
+ and not restrict_formatting
+ ):
+ return " " + y
return y
- str_columns = list(zip(*[[space_format(x, y) for y in x]
- for x in fmt_columns]))
+ str_columns = list(
+ zip(*[[space_format(x, y) for y in x] for x in fmt_columns])
+ )
if self.sparsify and len(str_columns):
str_columns = _sparsify(str_columns)
@@ -796,10 +880,10 @@ def space_format(x, y):
fmt_columns = columns.format()
dtypes = self.frame.dtypes
need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
- str_columns = [[' ' + x if not self._get_formatter(i) and
- need_leadsp[x] else x]
- for i, (col, x) in enumerate(zip(columns,
- fmt_columns))]
+ str_columns = [
+ [" " + x if not self._get_formatter(i) and need_leadsp[x] else x]
+ for i, (col, x) in enumerate(zip(columns, fmt_columns))
+ ]
# self.str_columns = str_columns
return str_columns
@@ -813,43 +897,45 @@ def has_column_names(self):
@property
def show_row_idx_names(self):
- return all((self.has_index_names,
- self.index,
- self.show_index_names))
+ return all((self.has_index_names, self.index, self.show_index_names))
@property
def show_col_idx_names(self):
- return all((self.has_column_names,
- self.show_index_names,
- self.header))
+ return all((self.has_column_names, self.show_index_names, self.header))
def _get_formatted_index(self, frame):
# Note: this is only used by to_string() and to_latex(), not by
# to_html().
index = frame.index
columns = frame.columns
- fmt = self._get_formatter('__index__')
+ fmt = self._get_formatter("__index__")
if isinstance(index, ABCMultiIndex):
fmt_index = index.format(
- sparsify=self.sparsify, adjoin=False,
- names=self.show_row_idx_names, formatter=fmt)
+ sparsify=self.sparsify,
+ adjoin=False,
+ names=self.show_row_idx_names,
+ formatter=fmt,
+ )
else:
- fmt_index = [index.format(
- name=self.show_row_idx_names, formatter=fmt)]
+ fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
- fmt_index = [tuple(_make_fixed_width(list(x), justify='left',
- minimum=(self.col_space or 0),
- adj=self.adj)) for x in fmt_index]
+ fmt_index = [
+ tuple(
+ _make_fixed_width(
+ list(x), justify="left", minimum=(self.col_space or 0), adj=self.adj
+ )
+ )
+ for x in fmt_index
+ ]
- adjoined = self.adj.adjoin(1, *fmt_index).split('\n')
+ adjoined = self.adj.adjoin(1, *fmt_index).split("\n")
# empty space for columns
if self.show_col_idx_names:
- col_header = ['{x}'.format(x=x)
- for x in self._get_column_name_list()]
+ col_header = ["{x}".format(x=x) for x in self._get_column_name_list()]
else:
- col_header = [''] * columns.nlevels
+ col_header = [""] * columns.nlevels
if self.header:
return col_header + adjoined
@@ -860,19 +946,27 @@ def _get_column_name_list(self):
names = []
columns = self.frame.columns
if isinstance(columns, ABCMultiIndex):
- names.extend('' if name is None else name
- for name in columns.names)
+ names.extend("" if name is None else name for name in columns.names)
else:
- names.append('' if columns.name is None else columns.name)
+ names.append("" if columns.name is None else columns.name)
return names
+
# ----------------------------------------------------------------------
# Array formatters
-def format_array(values, formatter, float_format=None, na_rep='NaN',
- digits=None, space=None, justify='right', decimal='.',
- leading_space=None):
+def format_array(
+ values,
+ formatter,
+ float_format=None,
+ na_rep="NaN",
+ digits=None,
+ space=None,
+ justify="right",
+ decimal=".",
+ leading_space=None,
+):
"""
Format an array for printing.
@@ -924,19 +1018,36 @@ def format_array(values, formatter, float_format=None, na_rep='NaN',
if digits is None:
digits = get_option("display.precision")
- fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep,
- float_format=float_format, formatter=formatter,
- space=space, justify=justify, decimal=decimal,
- leading_space=leading_space)
+ fmt_obj = fmt_klass(
+ values,
+ digits=digits,
+ na_rep=na_rep,
+ float_format=float_format,
+ formatter=formatter,
+ space=space,
+ justify=justify,
+ decimal=decimal,
+ leading_space=leading_space,
+ )
return fmt_obj.get_result()
class GenericArrayFormatter:
-
- def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
- space=12, float_format=None, justify='right', decimal='.',
- quoting=None, fixed_width=True, leading_space=None):
+ def __init__(
+ self,
+ values,
+ digits=7,
+ formatter=None,
+ na_rep="NaN",
+ space=12,
+ float_format=None,
+ justify="right",
+ decimal=".",
+ quoting=None,
+ fixed_width=True,
+ leading_space=None,
+ ):
self.values = values
self.digits = digits
self.na_rep = na_rep
@@ -957,15 +1068,18 @@ def _format_strings(self):
if self.float_format is None:
float_format = get_option("display.float_format")
if float_format is None:
- fmt_str = ('{{x: .{prec:d}g}}'
- .format(prec=get_option("display.precision")))
+ fmt_str = "{{x: .{prec:d}g}}".format(
+ prec=get_option("display.precision")
+ )
float_format = lambda x: fmt_str.format(x=x)
else:
float_format = self.float_format
formatter = (
- self.formatter if self.formatter is not None else
- (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n'))))
+ self.formatter
+ if self.formatter is not None
+ else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n")))
+ )
def _format(x):
if self.na_rep is not None and is_scalar(x) and isna(x):
@@ -973,18 +1087,18 @@ def _format(x):
# try block for np.isnat specifically
# determine na_rep if x is None or NaT-like
if x is None:
- return 'None'
+ return "None"
elif x is NaT or np.isnat(x):
- return 'NaT'
+ return "NaT"
except (TypeError, ValueError):
# np.isnat only handles datetime or timedelta objects
pass
return self.na_rep
elif isinstance(x, PandasObject):
- return '{x}'.format(x=x)
+ return "{x}".format(x=x)
else:
# object dtype
- return '{x}'.format(x=formatter(x))
+ return "{x}".format(x=formatter(x))
vals = self.values
if isinstance(vals, Index):
@@ -1000,16 +1114,16 @@ def _format(x):
fmt_values = []
for i, v in enumerate(vals):
if not is_float_type[i] and leading_space:
- fmt_values.append(' {v}'.format(v=_format(v)))
+ fmt_values.append(" {v}".format(v=_format(v)))
elif is_float_type[i]:
fmt_values.append(float_format(v))
else:
if leading_space is False:
# False specifically, so that the default is
# to include a space if we get here.
- tpl = '{v}'
+ tpl = "{v}"
else:
- tpl = ' {v}'
+ tpl = " {v}"
fmt_values.append(tpl.format(v=_format(v)))
return fmt_values
@@ -1047,15 +1161,20 @@ def _value_formatter(self, float_format=None, threshold=None):
# when there is no float_format, we use str instead of '%g'
# because str(0.0) = '0.0' while '%g' % 0.0 = '0'
if float_format:
+
def base_formatter(v):
return float_format(value=v) if notna(v) else self.na_rep
+
else:
+
def base_formatter(v):
return str(v) if notna(v) else self.na_rep
- if self.decimal != '.':
+ if self.decimal != ".":
+
def decimal_formatter(v):
- return base_formatter(v).replace('.', self.decimal, 1)
+ return base_formatter(v).replace(".", self.decimal, 1)
+
else:
decimal_formatter = base_formatter
@@ -1093,8 +1212,8 @@ def format_values_with(float_format):
# default formatter leaves a space to the left when formatting
# floats, must be consistent for left-justifying NaNs (GH #25061)
- if self.justify == 'left':
- na_rep = ' ' + self.na_rep
+ if self.justify == "left":
+ na_rep = " " + self.na_rep
else:
na_rep = self.na_rep
@@ -1102,13 +1221,14 @@ def format_values_with(float_format):
values = self.values
is_complex = is_complex_dtype(values)
mask = isna(values)
- if hasattr(values, 'to_dense'): # sparse numpy ndarray
+ if hasattr(values, "to_dense"): # sparse numpy ndarray
values = values.to_dense()
- values = np.array(values, dtype='object')
+ values = np.array(values, dtype="object")
values[mask] = na_rep
imask = (~mask).ravel()
- values.flat[imask] = np.array([formatter(val)
- for val in values.ravel()[imask]])
+ values.flat[imask] = np.array(
+ [formatter(val) for val in values.ravel()[imask]]
+ )
if self.fixed_width:
if is_complex:
@@ -1122,8 +1242,9 @@ def format_values_with(float_format):
# The default is otherwise to use str instead of a formatting string
if self.float_format is None:
if self.fixed_width:
- float_format = partial('{value: .{digits:d}f}'.format,
- digits=self.digits)
+ float_format = partial(
+ "{value: .{digits:d}f}".format, digits=self.digits
+ )
else:
float_format = self.float_format
else:
@@ -1144,18 +1265,18 @@ def format_values_with(float_format):
else:
too_long = False
- with np.errstate(invalid='ignore'):
+ with np.errstate(invalid="ignore"):
abs_vals = np.abs(self.values)
# this is pretty arbitrary for now
# large values: more that 8 characters including decimal symbol
# and first digit, hence > 1e6
has_large_values = (abs_vals > 1e6).any()
- has_small_values = ((abs_vals < 10**(-self.digits)) &
- (abs_vals > 0)).any()
+ has_small_values = (
+ (abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)
+ ).any()
if has_small_values or (too_long and has_large_values):
- float_format = partial('{value: .{digits:d}e}'.format,
- digits=self.digits)
+ float_format = partial("{value: .{digits:d}e}".format, digits=self.digits)
formatted_values = format_values_with(float_format)
return formatted_values
@@ -1169,16 +1290,14 @@ def _format_strings(self):
class IntArrayFormatter(GenericArrayFormatter):
-
def _format_strings(self):
- formatter = self.formatter or (lambda x: '{x: d}'.format(x=x))
+ formatter = self.formatter or (lambda x: "{x: d}".format(x=x))
fmt_values = [formatter(x) for x in self.values]
return fmt_values
class Datetime64Formatter(GenericArrayFormatter):
-
- def __init__(self, values, nat_rep='NaT', date_format=None, **kwargs):
+ def __init__(self, values, nat_rep="NaT", date_format=None, **kwargs):
super().__init__(values, **kwargs)
self.nat_rep = nat_rep
self.date_format = date_format
@@ -1196,9 +1315,9 @@ def _format_strings(self):
fmt_values = format_array_from_datetime(
values.asi8.ravel(),
- format=_get_format_datetime64_from_values(values,
- self.date_format),
- na_rep=self.nat_rep).reshape(values.shape)
+ format=_get_format_datetime64_from_values(values, self.date_format),
+ na_rep=self.nat_rep,
+ ).reshape(values.shape)
return fmt_values.tolist()
@@ -1216,12 +1335,16 @@ def _format_strings(self):
else:
array = np.asarray(values)
- fmt_values = format_array(array,
- formatter,
- float_format=self.float_format,
- na_rep=self.na_rep, digits=self.digits,
- space=self.space, justify=self.justify,
- leading_space=self.leading_space)
+ fmt_values = format_array(
+ array,
+ formatter,
+ float_format=self.float_format,
+ na_rep=self.na_rep,
+ digits=self.digits,
+ space=self.space,
+ justify=self.justify,
+ leading_space=self.leading_space,
+ )
return fmt_values
@@ -1261,9 +1384,12 @@ def format_percentiles(percentiles):
percentiles = np.asarray(percentiles)
# It checks for np.NaN as well
- with np.errstate(invalid='ignore'):
- if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \
- or not np.all(percentiles <= 1):
+ with np.errstate(invalid="ignore"):
+ if (
+ not is_numeric_dtype(percentiles)
+ or not np.all(percentiles >= 0)
+ or not np.all(percentiles <= 1)
+ ):
raise ValueError("percentiles should all be in the interval [0,1]")
percentiles = 100 * percentiles
@@ -1271,21 +1397,21 @@ def format_percentiles(percentiles):
if np.all(int_idx):
out = percentiles.astype(int).astype(str)
- return [i + '%' for i in out]
+ return [i + "%" for i in out]
unique_pcts = np.unique(percentiles)
to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
# Least precision that keeps percentiles unique after rounding
- prec = -np.floor(np.log10(np.min(
- np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)
- ))).astype(int)
+ prec = -np.floor(
+ np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)))
+ ).astype(int)
prec = max(1, prec)
out = np.empty_like(percentiles, dtype=object)
out[int_idx] = percentiles[int_idx].astype(int).astype(str)
out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
- return [i + '%' for i in out]
+ return [i + "%" for i in out]
def _is_dates_only(values):
@@ -1298,20 +1424,21 @@ def _is_dates_only(values):
values_int = values.asi8
consider_values = values_int != iNaT
- one_day_nanos = (86400 * 1e9)
- even_days = np.logical_and(consider_values,
- values_int % int(one_day_nanos) != 0).sum() == 0
+ one_day_nanos = 86400 * 1e9
+ even_days = (
+ np.logical_and(consider_values, values_int % int(one_day_nanos) != 0).sum() == 0
+ )
if even_days:
return True
return False
-def _format_datetime64(x, tz=None, nat_rep='NaT'):
+def _format_datetime64(x, tz=None, nat_rep="NaT"):
if x is None or (is_scalar(x) and isna(x)):
return nat_rep
if tz is not None or not isinstance(x, Timestamp):
- if getattr(x, 'tzinfo', None) is not None:
+ if getattr(x, "tzinfo", None) is not None:
x = Timestamp(x).tz_convert(tz)
else:
x = Timestamp(x).tz_localize(tz)
@@ -1319,7 +1446,7 @@ def _format_datetime64(x, tz=None, nat_rep='NaT'):
return str(x)
-def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
+def _format_datetime64_dateonly(x, nat_rep="NaT", date_format=None):
if x is None or (is_scalar(x) and isna(x)):
return nat_rep
@@ -1332,11 +1459,12 @@ def _format_datetime64_dateonly(x, nat_rep='NaT', date_format=None):
return x._date_repr
-def _get_format_datetime64(is_dates_only, nat_rep='NaT', date_format=None):
+def _get_format_datetime64(is_dates_only, nat_rep="NaT", date_format=None):
if is_dates_only:
return lambda x, tz=None: _format_datetime64_dateonly(
- x, nat_rep=nat_rep, date_format=date_format)
+ x, nat_rep=nat_rep, date_format=date_format
+ )
else:
return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep)
@@ -1356,36 +1484,34 @@ def _get_format_datetime64_from_values(values, date_format):
class Datetime64TZFormatter(Datetime64Formatter):
-
def _format_strings(self):
""" we by definition have a TZ """
values = self.values.astype(object)
is_dates_only = _is_dates_only(values)
- formatter = (self.formatter or
- _get_format_datetime64(is_dates_only,
- date_format=self.date_format))
+ formatter = self.formatter or _get_format_datetime64(
+ is_dates_only, date_format=self.date_format
+ )
fmt_values = [formatter(x) for x in values]
return fmt_values
class Timedelta64Formatter(GenericArrayFormatter):
-
- def __init__(self, values, nat_rep='NaT', box=False, **kwargs):
+ def __init__(self, values, nat_rep="NaT", box=False, **kwargs):
super().__init__(values, **kwargs)
self.nat_rep = nat_rep
self.box = box
def _format_strings(self):
- formatter = (self.formatter or
- _get_format_timedelta64(self.values, nat_rep=self.nat_rep,
- box=self.box))
+ formatter = self.formatter or _get_format_timedelta64(
+ self.values, nat_rep=self.nat_rep, box=self.box
+ )
fmt_values = np.array([formatter(x) for x in self.values])
return fmt_values
-def _get_format_timedelta64(values, nat_rep='NaT', box=False):
+def _get_format_timedelta64(values, nat_rep="NaT", box=False):
"""
Return a formatter function for a range of timedeltas.
These will all have the same format argument
@@ -1397,18 +1523,20 @@ def _get_format_timedelta64(values, nat_rep='NaT', box=False):
consider_values = values_int != iNaT
- one_day_nanos = (86400 * 1e9)
- even_days = np.logical_and(consider_values,
- values_int % one_day_nanos != 0).sum() == 0
- all_sub_day = np.logical_and(
- consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0
+ one_day_nanos = 86400 * 1e9
+ even_days = (
+ np.logical_and(consider_values, values_int % one_day_nanos != 0).sum() == 0
+ )
+ all_sub_day = (
+ np.logical_and(consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0
+ )
if even_days:
format = None
elif all_sub_day:
- format = 'sub_day'
+ format = "sub_day"
else:
- format = 'long'
+ format = "long"
def _formatter(x):
if x is None or (is_scalar(x) and isna(x)):
@@ -1424,9 +1552,9 @@ def _formatter(x):
return _formatter
-def _make_fixed_width(strings, justify='right', minimum=None, adj=None):
+def _make_fixed_width(strings, justify="right", minimum=None, adj=None):
- if len(strings) == 0 or justify == 'all':
+ if len(strings) == 0 or justify == "all":
return strings
if adj is None:
@@ -1444,7 +1572,7 @@ def _make_fixed_width(strings, justify='right', minimum=None, adj=None):
def just(x):
if conf_max is not None:
if (conf_max > 3) & (adj.len(x) > max_len):
- x = x[:max_len - 3] + '...'
+ x = x[: max_len - 3] + "..."
return x
strings = [just(x) for x in strings]
@@ -1452,41 +1580,46 @@ def just(x):
return result
-def _trim_zeros_complex(str_complexes, na_rep='NaN'):
+def _trim_zeros_complex(str_complexes, na_rep="NaN"):
"""
Separates the real and imaginary parts from the complex number, and
executes the _trim_zeros_float method on each of those.
"""
+
def separate_and_trim(str_complex, na_rep):
- num_arr = str_complex.split('+')
- return (_trim_zeros_float([num_arr[0]], na_rep) +
- ['+'] +
- _trim_zeros_float([num_arr[1][:-1]], na_rep) +
- ['j'])
+ num_arr = str_complex.split("+")
+ return (
+ _trim_zeros_float([num_arr[0]], na_rep)
+ + ["+"]
+ + _trim_zeros_float([num_arr[1][:-1]], na_rep)
+ + ["j"]
+ )
- return [''.join(separate_and_trim(x, na_rep)) for x in str_complexes]
+ return ["".join(separate_and_trim(x, na_rep)) for x in str_complexes]
-def _trim_zeros_float(str_floats, na_rep='NaN'):
+def _trim_zeros_float(str_floats, na_rep="NaN"):
"""
Trims zeros, leaving just one before the decimal points if need be.
"""
trimmed = str_floats
def _is_number(x):
- return (x != na_rep and not x.endswith('inf'))
+ return x != na_rep and not x.endswith("inf")
def _cond(values):
finite = [x for x in values if _is_number(x)]
- return (len(finite) > 0 and all(x.endswith('0') for x in finite) and
- not (any(('e' in x) or ('E' in x) for x in finite)))
+ return (
+ len(finite) > 0
+ and all(x.endswith("0") for x in finite)
+ and not (any(("e" in x) or ("E" in x) for x in finite))
+ )
while _cond(trimmed):
trimmed = [x[:-1] if _is_number(x) else x for x in trimmed]
# leave one 0 after the decimal points if need be.
- return [x + "0" if x.endswith('.') and _is_number(x) else x
- for x in trimmed]
+ return [x + "0" if x.endswith(".") and _is_number(x) else x for x in trimmed]
def _has_names(index):
@@ -1521,7 +1654,7 @@ class EngFormatter:
15: "P",
18: "E",
21: "Z",
- 24: "Y"
+ 24: "Y",
}
def __init__(self, accuracy=None, use_eng_prefix=False):
@@ -1551,13 +1684,14 @@ def __call__(self, num):
"""
import decimal
import math
+
dnum = decimal.Decimal(str(num))
if decimal.Decimal.is_nan(dnum):
- return 'NaN'
+ return "NaN"
if decimal.Decimal.is_infinite(dnum):
- return 'inf'
+ return "inf"
sign = 1
@@ -1578,17 +1712,16 @@ def __call__(self, num):
prefix = self.ENG_PREFIXES[int_pow10]
else:
if int_pow10 < 0:
- prefix = 'E-{pow10:02d}'.format(pow10=-int_pow10)
+ prefix = "E-{pow10:02d}".format(pow10=-int_pow10)
else:
- prefix = 'E+{pow10:02d}'.format(pow10=int_pow10)
+ prefix = "E+{pow10:02d}".format(pow10=int_pow10)
- mant = sign * dnum / (10**pow10)
+ mant = sign * dnum / (10 ** pow10)
if self.accuracy is None: # pragma: no cover
format_str = "{mant: g}{prefix}"
else:
- format_str = ("{{mant: .{acc:d}f}}{{prefix}}"
- .format(acc=self.accuracy))
+ format_str = "{{mant: .{acc:d}f}}{{prefix}}".format(acc=self.accuracy)
formatted = format_str.format(mant=mant, prefix=prefix)
@@ -1628,7 +1761,7 @@ def _binify(cols, line_width):
return bins
-def get_level_lengths(levels, sentinel=''):
+def get_level_lengths(levels, sentinel=""):
"""For each index in each level the function returns lengths of indexes.
Parameters
@@ -1681,4 +1814,4 @@ def buffer_put_lines(buf, lines):
"""
if any(isinstance(x, str) for x in lines):
lines = [str(x) for x in lines]
- buf.write('\n'.join(lines))
+ buf.write("\n".join(lines))
diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py
index 6fc36324092b5..e6aae44baa69b 100644
--- a/pandas/io/formats/html.py
+++ b/pandas/io/formats/html.py
@@ -35,17 +35,16 @@ def __init__(self, formatter, classes=None, border=None):
self.frame = self.fmt.frame
self.columns = self.fmt.tr_frame.columns
self.elements = []
- self.bold_rows = self.fmt.kwds.get('bold_rows', False)
- self.escape = self.fmt.kwds.get('escape', True)
+ self.bold_rows = self.fmt.kwds.get("bold_rows", False)
+ self.escape = self.fmt.kwds.get("escape", True)
self.show_dimensions = self.fmt.show_dimensions
if border is None:
- border = get_option('display.html.border')
+ border = get_option("display.html.border")
self.border = border
self.table_id = self.fmt.table_id
self.render_links = self.fmt.render_links
if isinstance(self.fmt.col_space, int):
- self.fmt.col_space = ('{colspace}px'
- .format(colspace=self.fmt.col_space))
+ self.fmt.col_space = "{colspace}px".format(colspace=self.fmt.col_space)
@property
def show_row_idx_names(self):
@@ -83,7 +82,7 @@ def ncols(self):
def write(self, s, indent=0):
rs = pprint_thing(s)
- self.elements.append(' ' * indent + rs)
+ self.elements.append(" " * indent + rs)
def write_th(self, s, header=False, indent=0, tags=None):
"""
@@ -109,25 +108,23 @@ def write_th(self, s, header=False, indent=0, tags=None):
A written cell.
"""
if header and self.fmt.col_space is not None:
- tags = (tags or "")
- tags += ('style="min-width: {colspace};"'
- .format(colspace=self.fmt.col_space))
+ tags = tags or ""
+ tags += 'style="min-width: {colspace};"'.format(colspace=self.fmt.col_space)
- return self._write_cell(s, kind='th', indent=indent, tags=tags)
+ return self._write_cell(s, kind="th", indent=indent, tags=tags)
def write_td(self, s, indent=0, tags=None):
- return self._write_cell(s, kind='td', indent=indent, tags=tags)
+ return self._write_cell(s, kind="td", indent=indent, tags=tags)
- def _write_cell(self, s, kind='td', indent=0, tags=None):
+ def _write_cell(self, s, kind="td", indent=0, tags=None):
if tags is not None:
- start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
+ start_tag = "<{kind} {tags}>".format(kind=kind, tags=tags)
else:
- start_tag = '<{kind}>'.format(kind=kind)
+ start_tag = "<{kind}>".format(kind=kind)
if self.escape:
# escape & first to prevent double escaping of &
- esc = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
- ('>', r'&gt;')])
+ esc = OrderedDict([("&", r"&amp;"), ("<", r"&lt;"), (">", r"&gt;")])
else:
esc = {}
@@ -135,25 +132,35 @@ def _write_cell(self, s, kind='td', indent=0, tags=None):
if self.render_links and _is_url(rs):
rs_unescaped = pprint_thing(s, escape_chars={}).strip()
- start_tag += '<a href="{url}" target="_blank">'.format(
- url=rs_unescaped)
- end_a = '</a>'
+ start_tag += '<a href="{url}" target="_blank">'.format(url=rs_unescaped)
+ end_a = "</a>"
else:
- end_a = ''
-
- self.write('{start}{rs}{end_a}</{kind}>'.format(
- start=start_tag, rs=rs, end_a=end_a, kind=kind), indent)
-
- def write_tr(self, line, indent=0, indent_delta=0, header=False,
- align=None, tags=None, nindex_levels=0):
+ end_a = ""
+
+ self.write(
+ "{start}{rs}{end_a}{kind}>".format(
+ start=start_tag, rs=rs, end_a=end_a, kind=kind
+ ),
+ indent,
+ )
+
+ def write_tr(
+ self,
+ line,
+ indent=0,
+ indent_delta=0,
+ header=False,
+ align=None,
+ tags=None,
+ nindex_levels=0,
+ ):
if tags is None:
tags = {}
if align is None:
- self.write('<tr>', indent)
+ self.write("<tr>", indent)
else:
- self.write('<tr style="text-align: {align};">'
- .format(align=align), indent)
+ self.write('<tr style="text-align: {align};">'.format(align=align), indent)
indent += indent_delta
for i, s in enumerate(line):
@@ -164,31 +171,34 @@ def write_tr(self, line, indent=0, indent_delta=0, header=False,
self.write_td(s, indent, tags=val_tag)
indent -= indent_delta
- self.write('</tr>', indent)
+ self.write("</tr>", indent)
def render(self):
self._write_table()
if self.should_show_dimensions:
by = chr(215) # ×
- self.write('<p>{rows} rows {by} {cols} columns</p>'
- .format(rows=len(self.frame),
- by=by,
- cols=len(self.frame.columns)))
+ self.write(
+ "<p>{rows} rows {by} {cols} columns</p>".format(
+ rows=len(self.frame), by=by, cols=len(self.frame.columns)
+ )
+ )
return self.elements
def _write_table(self, indent=0):
- _classes = ['dataframe'] # Default class.
+ _classes = ["dataframe"] # Default class.
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
- _classes.append('tex2jax_ignore')
+ _classes.append("tex2jax_ignore")
if self.classes is not None:
if isinstance(self.classes, str):
self.classes = self.classes.split()
if not isinstance(self.classes, (list, tuple)):
- raise TypeError('classes must be a string, list, or tuple, '
- 'not {typ}'.format(typ=type(self.classes)))
+ raise TypeError(
+ "classes must be a string, list, or tuple, "
+ "not {typ}".format(typ=type(self.classes))
+ )
_classes.extend(self.classes)
if self.table_id is None:
@@ -196,16 +206,19 @@ def _write_table(self, indent=0):
else:
id_section = ' id="{table_id}"'.format(table_id=self.table_id)
- self.write('<table border="{border}" class="{cls}"{id_section}>'
- .format(border=self.border, cls=' '.join(_classes),
- id_section=id_section), indent)
+ self.write(
+ '<table border="{border}" class="{cls}"{id_section}>'.format(
+ border=self.border, cls=" ".join(_classes), id_section=id_section
+ ),
+ indent,
+ )
if self.fmt.header or self.show_row_idx_names:
self._write_header(indent + self.indent_delta)
self._write_body(indent + self.indent_delta)
- self.write('</table>', indent)
+ self.write("</table>", indent)
def _write_col_header(self, indent):
truncate_h = self.fmt.truncate_h
@@ -217,12 +230,10 @@ def _write_col_header(self, indent):
sentinel = object()
else:
sentinel = False
- levels = self.columns.format(sparsify=sentinel, adjoin=False,
- names=False)
+ levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
- for lnum, (records, values) in enumerate(zip(level_lengths,
- levels)):
+ for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
if truncate_h:
# modify the header lines
ins_col = self.fmt.tr_col_num
@@ -235,21 +246,23 @@ def _write_col_header(self, indent):
elif tag + span > ins_col:
recs_new[tag] = span + 1
if lnum == inner_lvl:
- values = (values[:ins_col] + ('...',) +
- values[ins_col:])
+ values = (
+ values[:ins_col] + ("...",) + values[ins_col:]
+ )
else:
# sparse col headers do not receive a ...
- values = (values[:ins_col] +
- (values[ins_col - 1], ) +
- values[ins_col:])
+ values = (
+ values[:ins_col]
+ + (values[ins_col - 1],)
+ + values[ins_col:]
+ )
else:
recs_new[tag] = span
# if ins_col lies between tags, all col headers
# get ...
if tag + span == ins_col:
recs_new[ins_col] = 1
- values = (values[:ins_col] + ('...',) +
- values[ins_col:])
+ values = values[:ins_col] + ("...",) + values[ins_col:]
records = recs_new
inner_lvl = len(level_lengths) - 1
if lnum == inner_lvl:
@@ -263,8 +276,7 @@ def _write_col_header(self, indent):
recs_new[tag] = span
recs_new[ins_col] = 1
records = recs_new
- values = (values[:ins_col] + ['...'] +
- values[ins_col:])
+ values = values[:ins_col] + ["..."] + values[ins_col:]
# see gh-22579
# Column Offset Bug with to_html(index=False) with
@@ -272,7 +284,7 @@ def _write_col_header(self, indent):
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code
# block below for standard columns index.
- row = [''] * (self.row_levels - 1)
+ row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
@@ -283,9 +295,9 @@ def _write_col_header(self, indent):
# parity with DataFrameFormatter class.
if self.fmt.show_index_names:
name = self.columns.names[lnum]
- row.append(pprint_thing(name or ''))
+ row.append(pprint_thing(name or ""))
else:
- row.append('')
+ row.append("")
tags = {}
j = len(row)
@@ -297,8 +309,7 @@ def _write_col_header(self, indent):
continue
j += 1
row.append(v)
- self.write_tr(row, indent, self.indent_delta, tags=tags,
- header=True)
+ self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
else:
# see gh-22579
# Column misalignment also occurs for
@@ -306,7 +317,7 @@ def _write_col_header(self, indent):
# Initially fill row with blank cells before column names.
# TODO: Refactor to remove code duplication with code block
# above for columns MultiIndex.
- row = [''] * (self.row_levels - 1)
+ row = [""] * (self.row_levels - 1)
if self.fmt.index or self.show_col_idx_names:
# see gh-22747
# If to_html(index_names=False) do not show columns
@@ -314,27 +325,27 @@ def _write_col_header(self, indent):
# TODO: Refactor to use _get_column_name_list from
# DataFrameFormatter class.
if self.fmt.show_index_names:
- row.append(self.columns.name or '')
+ row.append(self.columns.name or "")
else:
- row.append('')
+ row.append("")
row.extend(self._get_columns_formatted_values())
align = self.fmt.justify
if truncate_h:
ins_col = self.row_levels + self.fmt.tr_col_num
- row.insert(ins_col, '...')
+ row.insert(ins_col, "...")
- self.write_tr(row, indent, self.indent_delta, header=True,
- align=align)
+ self.write_tr(row, indent, self.indent_delta, header=True, align=align)
def _write_row_header(self, indent):
truncate_h = self.fmt.truncate_h
- row = ([x if x is not None else '' for x in self.frame.index.names]
- + [''] * (self.ncols + (1 if truncate_h else 0)))
+ row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
+ self.ncols + (1 if truncate_h else 0)
+ )
self.write_tr(row, indent, self.indent_delta, header=True)
def _write_header(self, indent):
- self.write('<thead>', indent)
+ self.write("<thead>", indent)
if self.fmt.header:
self._write_col_header(indent + self.indent_delta)
@@ -342,27 +353,24 @@ def _write_header(self, indent):
if self.show_row_idx_names:
self._write_row_header(indent + self.indent_delta)
- self.write('</thead>', indent)
+ self.write("</thead>", indent)
def _get_formatted_values(self):
- with option_context('display.max_colwidth', 999999):
- fmt_values = {i: self.fmt._format_col(i)
- for i in range(self.ncols)}
+ with option_context("display.max_colwidth", 999999):
+ fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)}
return fmt_values
def _write_body(self, indent):
- self.write('<tbody>', indent)
+ self.write("<tbody>", indent)
fmt_values = self._get_formatted_values()
# write values
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
- self._write_hierarchical_rows(
- fmt_values, indent + self.indent_delta)
+ self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
else:
- self._write_regular_rows(
- fmt_values, indent + self.indent_delta)
+ self._write_regular_rows(fmt_values, indent + self.indent_delta)
- self.write('</tbody>', indent)
+ self.write("</tbody>", indent)
def _write_regular_rows(self, fmt_values, indent):
truncate_h = self.fmt.truncate_h
@@ -371,7 +379,7 @@ def _write_regular_rows(self, fmt_values, indent):
nrows = len(self.fmt.tr_frame)
if self.fmt.index:
- fmt = self.fmt._get_formatter('__index__')
+ fmt = self.fmt._get_formatter("__index__")
if fmt is not None:
index_values = self.fmt.tr_frame.index.map(fmt)
else:
@@ -381,9 +389,14 @@ def _write_regular_rows(self, fmt_values, indent):
for i in range(nrows):
if truncate_v and i == (self.fmt.tr_row_num):
- str_sep_row = ['...'] * len(row)
- self.write_tr(str_sep_row, indent, self.indent_delta,
- tags=None, nindex_levels=self.row_levels)
+ str_sep_row = ["..."] * len(row)
+ self.write_tr(
+ str_sep_row,
+ indent,
+ self.indent_delta,
+ tags=None,
+ nindex_levels=self.row_levels,
+ )
row = []
if self.fmt.index:
@@ -393,14 +406,15 @@ def _write_regular_rows(self, fmt_values, indent):
# a standard index when the columns index is named.
# Add blank cell before data cells.
elif self.show_col_idx_names:
- row.append('')
+ row.append("")
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
dot_col_ix = self.fmt.tr_col_num + self.row_levels
- row.insert(dot_col_ix, '...')
- self.write_tr(row, indent, self.indent_delta, tags=None,
- nindex_levels=self.row_levels)
+ row.insert(dot_col_ix, "...")
+ self.write_tr(
+ row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
+ )
def _write_hierarchical_rows(self, fmt_values, indent):
template = 'rowspan="{span}" valign="top"'
@@ -410,15 +424,13 @@ def _write_hierarchical_rows(self, fmt_values, indent):
frame = self.fmt.tr_frame
nrows = len(frame)
- idx_values = frame.index.format(sparsify=False, adjoin=False,
- names=False)
+ idx_values = frame.index.format(sparsify=False, adjoin=False, names=False)
idx_values = list(zip(*idx_values))
if self.fmt.sparsify:
# GH3547
sentinel = object()
- levels = frame.index.format(sparsify=sentinel, adjoin=False,
- names=False)
+ levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False)
level_lengths = get_level_lengths(levels, sentinel)
inner_lvl = len(level_lengths) - 1
@@ -438,12 +450,12 @@ def _write_hierarchical_rows(self, fmt_values, indent):
# GH 14882 - Make sure insertion done once
if not inserted:
dot_row = list(idx_values[ins_row - 1])
- dot_row[-1] = '...'
+ dot_row[-1] = "..."
idx_values.insert(ins_row, tuple(dot_row))
inserted = True
else:
dot_row = list(idx_values[ins_row])
- dot_row[inner_lvl - lnum] = '...'
+ dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
else:
rec_new[tag] = span
@@ -452,19 +464,20 @@ def _write_hierarchical_rows(self, fmt_values, indent):
if tag + span == ins_row:
rec_new[ins_row] = 1
if lnum == 0:
- idx_values.insert(ins_row, tuple(
- ['...'] * len(level_lengths)))
+ idx_values.insert(
+ ins_row, tuple(["..."] * len(level_lengths))
+ )
# GH 14882 - Place ... in correct level
elif inserted:
dot_row = list(idx_values[ins_row])
- dot_row[inner_lvl - lnum] = '...'
+ dot_row[inner_lvl - lnum] = "..."
idx_values[ins_row] = tuple(dot_row)
level_lengths[lnum] = rec_new
level_lengths[inner_lvl][ins_row] = 1
for ix_col in range(len(fmt_values)):
- fmt_values[ix_col].insert(ins_row, '...')
+ fmt_values[ix_col].insert(ins_row, "...")
nrows += 1
for i in range(nrows):
@@ -486,27 +499,44 @@ def _write_hierarchical_rows(self, fmt_values, indent):
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
- row.insert(self.row_levels - sparse_offset +
- self.fmt.tr_col_num, '...')
- self.write_tr(row, indent, self.indent_delta, tags=tags,
- nindex_levels=len(levels) - sparse_offset)
+ row.insert(
+ self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
+ )
+ self.write_tr(
+ row,
+ indent,
+ self.indent_delta,
+ tags=tags,
+ nindex_levels=len(levels) - sparse_offset,
+ )
else:
row = []
for i in range(len(frame)):
if truncate_v and i == (self.fmt.tr_row_num):
- str_sep_row = ['...'] * len(row)
- self.write_tr(str_sep_row, indent, self.indent_delta,
- tags=None, nindex_levels=self.row_levels)
-
- idx_values = list(zip(*frame.index.format(
- sparsify=False, adjoin=False, names=False)))
+ str_sep_row = ["..."] * len(row)
+ self.write_tr(
+ str_sep_row,
+ indent,
+ self.indent_delta,
+ tags=None,
+ nindex_levels=self.row_levels,
+ )
+
+ idx_values = list(
+ zip(*frame.index.format(sparsify=False, adjoin=False, names=False))
+ )
row = []
row.extend(idx_values[i])
row.extend(fmt_values[j][i] for j in range(self.ncols))
if truncate_h:
- row.insert(self.row_levels + self.fmt.tr_col_num, '...')
- self.write_tr(row, indent, self.indent_delta, tags=None,
- nindex_levels=frame.index.nlevels)
+ row.insert(self.row_levels + self.fmt.tr_col_num, "...")
+ self.write_tr(
+ row,
+ indent,
+ self.indent_delta,
+ tags=None,
+ nindex_levels=frame.index.nlevels,
+ )
class NotebookFormatter(HTMLFormatter):
@@ -534,34 +564,25 @@ def write_style(self):
.dataframe %s {
%s: %s;
}"""
- element_props = [('tbody tr th:only-of-type',
- 'vertical-align',
- 'middle'),
- ('tbody tr th',
- 'vertical-align',
- 'top')]
+ element_props = [
+ ("tbody tr th:only-of-type", "vertical-align", "middle"),
+ ("tbody tr th", "vertical-align", "top"),
+ ]
if isinstance(self.columns, ABCMultiIndex):
- element_props.append(('thead tr th',
- 'text-align',
- 'left'))
+ element_props.append(("thead tr th", "text-align", "left"))
if self.show_row_idx_names:
- element_props.append(('thead tr:last-of-type th',
- 'text-align',
- 'right'))
+ element_props.append(
+ ("thead tr:last-of-type th", "text-align", "right")
+ )
else:
- element_props.append(('thead th',
- 'text-align',
- 'right'))
- template_mid = '\n\n'.join(map(lambda t: template_select % t,
- element_props))
- template = dedent('\n'.join((template_first,
- template_mid,
- template_last)))
+ element_props.append(("thead th", "text-align", "right"))
+ template_mid = "\n\n".join(map(lambda t: template_select % t, element_props))
+ template = dedent("\n".join((template_first, template_mid, template_last)))
self.write(template)
def render(self):
- self.write('<div>')
+ self.write("<div>")
self.write_style()
super().render()
- self.write('</div>')
+ self.write("</div>")
return self.elements
diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py
index 33bc413e9c3fe..dad099b747701 100644
--- a/pandas/io/formats/latex.py
+++ b/pandas/io/formats/latex.py
@@ -26,11 +26,18 @@ class LatexFormatter(TableFormatter):
HTMLFormatter
"""
- def __init__(self, formatter, column_format=None, longtable=False,
- multicolumn=False, multicolumn_format=None, multirow=False):
+ def __init__(
+ self,
+ formatter,
+ column_format=None,
+ longtable=False,
+ multicolumn=False,
+ multicolumn_format=None,
+ multirow=False,
+ ):
self.fmt = formatter
self.frame = self.fmt.frame
- self.bold_rows = self.fmt.kwds.get('bold_rows', False)
+ self.bold_rows = self.fmt.kwds.get("bold_rows", False)
self.column_format = column_format
self.longtable = longtable
self.multicolumn = multicolumn
@@ -44,25 +51,28 @@ def write_result(self, buf):
# string representation of the columns
if len(self.frame.columns) == 0 or len(self.frame.index) == 0:
- info_line = ('Empty {name}\nColumns: {col}\nIndex: {idx}'
- .format(name=type(self.frame).__name__,
- col=self.frame.columns,
- idx=self.frame.index))
+ info_line = "Empty {name}\nColumns: {col}\nIndex: {idx}".format(
+ name=type(self.frame).__name__,
+ col=self.frame.columns,
+ idx=self.frame.index,
+ )
strcols = [[info_line]]
else:
strcols = self.fmt._to_str_columns()
def get_col_type(dtype):
if issubclass(dtype.type, np.number):
- return 'r'
+ return "r"
else:
- return 'l'
+ return "l"
# reestablish the MultiIndex that has been joined by _to_str_column
if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
out = self.frame.index.format(
- adjoin=False, sparsify=self.fmt.sparsify,
- names=self.fmt.has_index_names, na_rep=self.fmt.na_rep
+ adjoin=False,
+ sparsify=self.fmt.sparsify,
+ names=self.fmt.has_index_names,
+ na_rep=self.fmt.na_rep,
)
# index.format will sparsify repeated entries with empty strings
@@ -71,17 +81,18 @@ def pad_empties(x):
for pad in reversed(x):
if pad:
break
- return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]
+ return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
+
out = (pad_empties(i) for i in out)
# Add empty spaces for each column level
clevels = self.frame.columns.nlevels
- out = [[' ' * len(i[-1])] * clevels + i for i in out]
+ out = [[" " * len(i[-1])] * clevels + i for i in out]
# Add the column names to the last index column
cnames = self.frame.columns.names
if any(cnames):
- new_names = [i if i else '{}' for i in cnames]
+ new_names = [i if i else "{}" for i in cnames]
out[self.frame.index.nlevels - 1][:clevels] = new_names
# Get rid of old multiindex column and add new ones
@@ -90,22 +101,22 @@ def pad_empties(x):
column_format = self.column_format
if column_format is None:
dtypes = self.frame.dtypes._values
- column_format = ''.join(map(get_col_type, dtypes))
+ column_format = "".join(map(get_col_type, dtypes))
if self.fmt.index:
- index_format = 'l' * self.frame.index.nlevels
+ index_format = "l" * self.frame.index.nlevels
column_format = index_format + column_format
elif not isinstance(column_format, str): # pragma: no cover
- raise AssertionError('column_format must be str or unicode, '
- 'not {typ}'.format(typ=type(column_format)))
+ raise AssertionError(
+ "column_format must be str or unicode, "
+ "not {typ}".format(typ=type(column_format))
+ )
if not self.longtable:
- buf.write('\\begin{{tabular}}{{{fmt}}}\n'
- .format(fmt=column_format))
- buf.write('\\toprule\n')
+ buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format))
+ buf.write("\\toprule\n")
else:
- buf.write('\\begin{{longtable}}{{{fmt}}}\n'
- .format(fmt=column_format))
- buf.write('\\toprule\n')
+ buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format))
+ buf.write("\\toprule\n")
ilevels = self.frame.index.nlevels
clevels = self.frame.columns.nlevels
@@ -117,50 +128,63 @@ def pad_empties(x):
for i, row in enumerate(strrows):
if i == nlevels and self.fmt.header:
- buf.write('\\midrule\n') # End of header
+ buf.write("\\midrule\n") # End of header
if self.longtable:
- buf.write('\\endhead\n')
- buf.write('\\midrule\n')
- buf.write('\\multicolumn{{{n}}}{{r}}{{{{Continued on next '
- 'page}}}} \\\\\n'.format(n=len(row)))
- buf.write('\\midrule\n')
- buf.write('\\endfoot\n\n')
- buf.write('\\bottomrule\n')
- buf.write('\\endlastfoot\n')
- if self.fmt.kwds.get('escape', True):
+ buf.write("\\endhead\n")
+ buf.write("\\midrule\n")
+ buf.write(
+ "\\multicolumn{{{n}}}{{r}}{{{{Continued on next "
+ "page}}}} \\\\\n".format(n=len(row))
+ )
+ buf.write("\\midrule\n")
+ buf.write("\\endfoot\n\n")
+ buf.write("\\bottomrule\n")
+ buf.write("\\endlastfoot\n")
+ if self.fmt.kwds.get("escape", True):
# escape backslashes first
- crow = [(x.replace('\\', '\\textbackslash ')
- .replace('_', '\\_')
- .replace('%', '\\%').replace('$', '\\$')
- .replace('#', '\\#').replace('{', '\\{')
- .replace('}', '\\}').replace('~', '\\textasciitilde ')
- .replace('^', '\\textasciicircum ')
- .replace('&', '\\&')
- if (x and x != '{}') else '{}') for x in row]
+ crow = [
+ (
+ x.replace("\\", "\\textbackslash ")
+ .replace("_", "\\_")
+ .replace("%", "\\%")
+ .replace("$", "\\$")
+ .replace("#", "\\#")
+ .replace("{", "\\{")
+ .replace("}", "\\}")
+ .replace("~", "\\textasciitilde ")
+ .replace("^", "\\textasciicircum ")
+ .replace("&", "\\&")
+ if (x and x != "{}")
+ else "{}"
+ )
+ for x in row
+ ]
else:
- crow = [x if x else '{}' for x in row]
+ crow = [x if x else "{}" for x in row]
if self.bold_rows and self.fmt.index:
# bold row labels
- crow = ['\\textbf{{{x}}}'.format(x=x)
- if j < ilevels and x.strip() not in ['', '{}'] else x
- for j, x in enumerate(crow)]
+ crow = [
+ "\\textbf{{{x}}}".format(x=x)
+ if j < ilevels and x.strip() not in ["", "{}"]
+ else x
+ for j, x in enumerate(crow)
+ ]
if i < clevels and self.fmt.header and self.multicolumn:
# sum up columns to multicolumns
crow = self._format_multicolumn(crow, ilevels)
- if (i >= nlevels and self.fmt.index and self.multirow and
- ilevels > 1):
+ if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1:
# sum up rows to multirows
crow = self._format_multirow(crow, ilevels, i, strrows)
- buf.write(' & '.join(crow))
- buf.write(' \\\\\n')
+ buf.write(" & ".join(crow))
+ buf.write(" \\\\\n")
if self.multirow and i < len(strrows) - 1:
self._print_cline(buf, i, len(strcols))
if not self.longtable:
- buf.write('\\bottomrule\n')
- buf.write('\\end{tabular}\n')
+ buf.write("\\bottomrule\n")
+ buf.write("\\end{tabular}\n")
else:
- buf.write('\\end{longtable}\n')
+ buf.write("\\end{longtable}\n")
def _format_multicolumn(self, row, ilevels):
r"""
@@ -174,17 +198,20 @@ def _format_multicolumn(self, row, ilevels):
"""
row2 = list(row[:ilevels])
ncol = 1
- coltext = ''
+ coltext = ""
def append_col():
# write multicolumn if needed
if ncol > 1:
- row2.append('\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}'
- .format(ncol=ncol, fmt=self.multicolumn_format,
- txt=coltext.strip()))
+ row2.append(
+ "\\multicolumn{{{ncol:d}}}{{{fmt:s}}}{{{txt:s}}}".format(
+ ncol=ncol, fmt=self.multicolumn_format, txt=coltext.strip()
+ )
+ )
# don't modify where not needed
else:
row2.append(coltext)
+
for c in row[ilevels:]:
# if next col has text, write the previous
if c.strip():
@@ -213,15 +240,16 @@ def _format_multirow(self, row, ilevels, i, rows):
for j in range(ilevels):
if row[j].strip():
nrow = 1
- for r in rows[i + 1:]:
+ for r in rows[i + 1 :]:
if not r[j].strip():
nrow += 1
else:
break
if nrow > 1:
# overwrite non-multirow entry
- row[j] = '\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(
- nrow=nrow, row=row[j].strip())
+ row[j] = "\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}".format(
+ nrow=nrow, row=row[j].strip()
+ )
# save when to end the current block with \cline
self.clinebuf.append([i + nrow - 1, j + 1])
return row
@@ -232,7 +260,6 @@ def _print_cline(self, buf, i, icol):
"""
for cl in self.clinebuf:
if cl[0] == i:
- buf.write('\\cline{{{cl:d}-{icol:d}}}\n'
- .format(cl=cl[1], icol=icol))
+ buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol))
# remove entries that have been written to buffer
self.clinebuf = [x for x in self.clinebuf if x[0] != i]
diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py
index 73d8586a0a8c9..4958d8246610e 100644
--- a/pandas/io/formats/printing.py
+++ b/pandas/io/formats/printing.py
@@ -25,8 +25,8 @@ def adjoin(space, *lists, **kwargs):
justfunc : callable
function used to justify str. Needed for unicode handling.
"""
- strlen = kwargs.pop('strlen', len)
- justfunc = kwargs.pop('justfunc', justify)
+ strlen = kwargs.pop("strlen", len)
+ justfunc = kwargs.pop("justfunc", justify)
out_lines = []
newLists = []
@@ -35,34 +35,33 @@ def adjoin(space, *lists, **kwargs):
lengths.append(max(map(len, lists[-1])))
maxLen = max(map(len, lists))
for i, lst in enumerate(lists):
- nl = justfunc(lst, lengths[i], mode='left')
- nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
+ nl = justfunc(lst, lengths[i], mode="left")
+ nl.extend([" " * lengths[i]] * (maxLen - len(lst)))
newLists.append(nl)
toJoin = zip(*newLists)
for lines in toJoin:
out_lines.append(_join_unicode(lines))
- return _join_unicode(out_lines, sep='\n')
+ return _join_unicode(out_lines, sep="\n")
-def justify(texts, max_len, mode='right'):
+def justify(texts, max_len, mode="right"):
"""
Perform ljust, center, rjust against string or list-like
"""
- if mode == 'left':
+ if mode == "left":
return [x.ljust(max_len) for x in texts]
- elif mode == 'center':
+ elif mode == "center":
return [x.center(max_len) for x in texts]
else:
return [x.rjust(max_len) for x in texts]
-def _join_unicode(lines, sep=''):
+def _join_unicode(lines, sep=""):
try:
return sep.join(lines)
except UnicodeDecodeError:
sep = str(sep)
- return sep.join([x.decode('utf-8') if isinstance(x, str) else x
- for x in lines])
+ return sep.join([x.decode("utf-8") if isinstance(x, str) else x for x in lines])
# Unicode consolidation
@@ -99,7 +98,7 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
if isinstance(seq, set):
fmt = "{{{body}}}"
else:
- fmt = "[{body}]" if hasattr(seq, '__setitem__') else "({body})"
+ fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
if max_seq_items is False:
nitems = len(seq)
@@ -108,15 +107,16 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):
s = iter(seq)
# handle sets, no slicing
- r = [pprint_thing(next(s),
- _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
- for i in range(min(nitems, len(seq)))]
+ r = [
+ pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
+ for i in range(min(nitems, len(seq)))
+ ]
body = ", ".join(r)
if nitems < len(seq):
body += ", ..."
elif isinstance(seq, tuple) and len(seq) == 1:
- body += ','
+ body += ","
return fmt.format(body=body)
@@ -139,10 +139,10 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
for k, v in list(seq.items())[:nitems]:
pairs.append(
pfmt.format(
- key=pprint_thing(k, _nest_lvl + 1,
- max_seq_items=max_seq_items, **kwds),
- val=pprint_thing(v, _nest_lvl + 1,
- max_seq_items=max_seq_items, **kwds)))
+ key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
+ val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
+ )
+ )
if nitems < len(seq):
return fmt.format(things=", ".join(pairs) + ", ...")
@@ -150,8 +150,14 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):
return fmt.format(things=", ".join(pairs))
-def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,
- quote_strings=False, max_seq_items=None):
+def pprint_thing(
+ thing,
+ _nest_lvl=0,
+ escape_chars=None,
+ default_escapes=False,
+ quote_strings=False,
+ max_seq_items=None,
+):
"""
This function is the sanctioned way of converting objects
to a unicode representation.
@@ -188,9 +194,9 @@ def as_escaped_unicode(thing, escape_chars=escape_chars):
result = str(thing) # we should try this first
except UnicodeDecodeError:
# either utf-8 or we replace errors
- result = str(thing).decode('utf-8', "replace")
+ result = str(thing).decode("utf-8", "replace")
- translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', }
+ translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"}
if isinstance(escape_chars, dict):
if default_escapes:
translate.update(escape_chars)
@@ -204,17 +210,22 @@ def as_escaped_unicode(thing, escape_chars=escape_chars):
return str(result)
- if hasattr(thing, '__next__'):
+ if hasattr(thing, "__next__"):
return str(thing)
- elif (isinstance(thing, dict) and
- _nest_lvl < get_option("display.pprint_nest_depth")):
- result = _pprint_dict(thing, _nest_lvl, quote_strings=True,
- max_seq_items=max_seq_items)
- elif (is_sequence(thing) and
- _nest_lvl < get_option("display.pprint_nest_depth")):
- result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,
- quote_strings=quote_strings,
- max_seq_items=max_seq_items)
+ elif isinstance(thing, dict) and _nest_lvl < get_option(
+ "display.pprint_nest_depth"
+ ):
+ result = _pprint_dict(
+ thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
+ )
+ elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
+ result = _pprint_seq(
+ thing,
+ _nest_lvl,
+ escape_chars=escape_chars,
+ quote_strings=quote_strings,
+ max_seq_items=max_seq_items,
+ )
elif isinstance(thing, str) and quote_strings:
result = "'{thing}'".format(thing=as_escaped_unicode(thing))
else:
@@ -223,16 +234,17 @@ def as_escaped_unicode(thing, escape_chars=escape_chars):
return str(result) # always unicode
-def pprint_thing_encoded(object, encoding='utf-8', errors='replace', **kwds):
+def pprint_thing_encoded(object, encoding="utf-8", errors="replace", **kwds):
value = pprint_thing(object) # get unicode representation of object
return value.encode(encoding, errors, **kwds)
def _enable_data_resource_formatter(enable):
- if 'IPython' not in sys.modules:
+ if "IPython" not in sys.modules:
# definitely not in IPython
return
from IPython import get_ipython
+
ip = get_ipython()
if ip is None:
# still not in IPython
@@ -247,8 +259,9 @@ def _enable_data_resource_formatter(enable):
from IPython.core.formatters import BaseFormatter
class TableSchemaFormatter(BaseFormatter):
- print_method = '_repr_data_resource_'
+ print_method = "_repr_data_resource_"
_return_type = (dict,)
+
# register it:
formatters[mimetype] = TableSchemaFormatter()
# enable it if it's been disabled:
@@ -259,13 +272,19 @@ class TableSchemaFormatter(BaseFormatter):
formatters[mimetype].enabled = False
-default_pprint = lambda x, max_seq_items=None: \
- pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True,
- max_seq_items=max_seq_items)
+default_pprint = lambda x, max_seq_items=None: pprint_thing(
+ x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items
+)
-def format_object_summary(obj, formatter, is_justify=True, name=None,
- indent_for_name=True, line_break_each_value=False):
+def format_object_summary(
+ obj,
+ formatter,
+ is_justify=True,
+ name=None,
+ indent_for_name=True,
+ line_break_each_value=False,
+):
"""
Return the formatted obj as a unicode string
@@ -299,14 +318,14 @@ def format_object_summary(obj, formatter, is_justify=True, name=None,
display_width, _ = get_console_size()
if display_width is None:
- display_width = get_option('display.width') or 80
+ display_width = get_option("display.width") or 80
if name is None:
name = obj.__class__.__name__
if indent_for_name:
name_len = len(name)
- space1 = "\n%s" % (' ' * (name_len + 1))
- space2 = "\n%s" % (' ' * (name_len + 2))
+ space1 = "\n%s" % (" " * (name_len + 1))
+ space2 = "\n%s" % (" " * (name_len + 2))
else:
space1 = "\n"
space2 = "\n " # space for the opening '['
@@ -315,10 +334,10 @@ def format_object_summary(obj, formatter, is_justify=True, name=None,
if line_break_each_value:
# If we want to vertically align on each value of obj, we need to
# separate values by a line break and indent the values
- sep = ',\n ' + ' ' * len(name)
+ sep = ",\n " + " " * len(name)
else:
- sep = ','
- max_seq_items = get_option('display.max_seq_items') or n
+ sep = ","
+ max_seq_items = get_option("display.max_seq_items") or n
# are we a truncated display
is_truncated = n > max_seq_items
@@ -328,8 +347,7 @@ def format_object_summary(obj, formatter, is_justify=True, name=None,
def _extend_line(s, line, value, display_width, next_line_prefix):
- if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >=
- display_width):
+ if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
s += line.rstrip()
line = next_line_prefix
line += value
@@ -341,17 +359,17 @@ def best_len(values):
else:
return 0
- close = ', '
+ close = ", "
if n == 0:
- summary = '[]{}'.format(close)
+ summary = "[]{}".format(close)
elif n == 1 and not line_break_each_value:
first = formatter(obj[0])
- summary = '[{}]{}'.format(first, close)
+ summary = "[{}]{}".format(first, close)
elif n == 2 and not line_break_each_value:
first = formatter(obj[0])
last = formatter(obj[-1])
- summary = '[{}, {}]{}'.format(first, last, close)
+ summary = "[{}, {}]{}".format(first, last, close)
else:
if n > max_seq_items:
@@ -369,8 +387,10 @@ def best_len(values):
# strings will right align when head and tail are stacked
# vertically.
head, tail = _justify(head, tail)
- elif (is_truncated or not (len(', '.join(head)) < display_width and
- len(', '.join(tail)) < display_width)):
+ elif is_truncated or not (
+ len(", ".join(head)) < display_width
+ and len(", ".join(tail)) < display_width
+ ):
# Each string in head and tail should align with each other
max_length = max(best_len(head), best_len(tail))
head = [x.rjust(max_length) for x in head]
@@ -396,37 +416,34 @@ def best_len(values):
line = space2
for max_items in range(len(head)):
- word = head[max_items] + sep + ' '
- summary, line = _extend_line(summary, line, word,
- display_width, space2)
+ word = head[max_items] + sep + " "
+ summary, line = _extend_line(summary, line, word, display_width, space2)
if is_truncated:
# remove trailing space of last line
- summary += line.rstrip() + space2 + '...'
+ summary += line.rstrip() + space2 + "..."
line = space2
for max_items in range(len(tail) - 1):
- word = tail[max_items] + sep + ' '
- summary, line = _extend_line(summary, line, word,
- display_width, space2)
+ word = tail[max_items] + sep + " "
+ summary, line = _extend_line(summary, line, word, display_width, space2)
# last value: no sep added + 1 space of width used for trailing ','
- summary, line = _extend_line(summary, line, tail[-1],
- display_width - 2, space2)
+ summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
summary += line
# right now close is either '' or ', '
# Now we want to include the ']', but not the maybe space.
- close = ']' + close.rstrip(' ')
+ close = "]" + close.rstrip(" ")
summary += close
if len(summary) > (display_width) or line_break_each_value:
summary += space1
else: # one row
- summary += ' '
+ summary += " "
# remove initial space
- summary = '[' + summary[len(space2):]
+ summary = "[" + summary[len(space2) :]
return summary
@@ -461,10 +478,12 @@ def _justify(head, tail):
max_length = [max(x, y) for x, y in zip(max_length, length)]
# justify each item in each list-like in head and tail using max_length
- head = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length))
- for seq in head]
- tail = [tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length))
- for seq in tail]
+ head = [
+ tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head
+ ]
+ tail = [
+ tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail
+ ]
return head, tail
@@ -486,13 +505,13 @@ def format_object_attrs(obj, include_dtype=True):
"""
attrs = []
- if hasattr(obj, 'dtype') and include_dtype:
- attrs.append(('dtype', "'{}'".format(obj.dtype)))
- if getattr(obj, 'name', None) is not None:
- attrs.append(('name', default_pprint(obj.name)))
- elif getattr(obj, 'names', None) is not None and any(obj.names):
- attrs.append(('names', default_pprint(obj.names)))
- max_seq_items = get_option('display.max_seq_items') or len(obj)
+ if hasattr(obj, "dtype") and include_dtype:
+ attrs.append(("dtype", "'{}'".format(obj.dtype)))
+ if getattr(obj, "name", None) is not None:
+ attrs.append(("name", default_pprint(obj.name)))
+ elif getattr(obj, "names", None) is not None and any(obj.names):
+ attrs.append(("names", default_pprint(obj.names)))
+ max_seq_items = get_option("display.max_seq_items") or len(obj)
if len(obj) > max_seq_items:
- attrs.append(('length', len(obj)))
+ attrs.append(("length", len(obj)))
return attrs
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index 0d9b5fe4314a3..e7aa5d22995c6 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -26,14 +26,13 @@
from pandas.core.generic import _shared_docs
from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice
-jinja2 = import_optional_dependency(
- "jinja2", extra="DataFrame.style requires jinja2."
-)
+jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.")
try:
import matplotlib.pyplot as plt
from matplotlib import colors
+
has_mpl = True
except ImportError:
has_mpl = False
@@ -108,15 +107,21 @@ class Styler:
* Blank cells include ``blank``
* Data cells include ``data``
"""
+
loader = jinja2.PackageLoader("pandas", "io/formats/templates")
- env = jinja2.Environment(
- loader=loader,
- trim_blocks=True,
- )
+ env = jinja2.Environment(loader=loader, trim_blocks=True)
template = env.get_template("html.tpl")
- def __init__(self, data, precision=None, table_styles=None, uuid=None,
- caption=None, table_attributes=None, cell_ids=True):
+ def __init__(
+ self,
+ data,
+ precision=None,
+ table_styles=None,
+ uuid=None,
+ caption=None,
+ table_attributes=None,
+ cell_ids=True,
+ ):
self.ctx = defaultdict(list)
self._todo = []
@@ -135,7 +140,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None,
self.table_styles = table_styles
self.caption = caption
if precision is None:
- precision = get_option('display.precision')
+ precision = get_option("display.precision")
self.precision = precision
self.table_attributes = table_attributes
self.hidden_index = False
@@ -146,7 +151,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None,
def default_display_func(x):
if is_float(x):
- return '{:>.{precision}g}'.format(x, precision=self.precision)
+ return "{:>.{precision}g}".format(x, precision=self.precision)
else:
return x
@@ -158,29 +163,59 @@ def _repr_html_(self):
"""
return self.render()
- @Appender(_shared_docs['to_excel'] % dict(
- axes='index, columns', klass='Styler',
- axes_single_arg="{0 or 'index', 1 or 'columns'}",
- optional_by="""
+ @Appender(
+ _shared_docs["to_excel"]
+ % dict(
+ axes="index, columns",
+ klass="Styler",
+ axes_single_arg="{0 or 'index', 1 or 'columns'}",
+ optional_by="""
by : str or list of str
Name or list of names which refer to the axis items.""",
- versionadded_to_excel='\n .. versionadded:: 0.20'))
- def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
- float_format=None, columns=None, header=True, index=True,
- index_label=None, startrow=0, startcol=0, engine=None,
- merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
- freeze_panes=None):
+ versionadded_to_excel="\n .. versionadded:: 0.20",
+ )
+ )
+ def to_excel(
+ self,
+ excel_writer,
+ sheet_name="Sheet1",
+ na_rep="",
+ float_format=None,
+ columns=None,
+ header=True,
+ index=True,
+ index_label=None,
+ startrow=0,
+ startcol=0,
+ engine=None,
+ merge_cells=True,
+ encoding=None,
+ inf_rep="inf",
+ verbose=True,
+ freeze_panes=None,
+ ):
from pandas.io.formats.excel import ExcelFormatter
- formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns,
- header=header,
- float_format=float_format, index=index,
- index_label=index_label,
- merge_cells=merge_cells,
- inf_rep=inf_rep)
- formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
- startcol=startcol, freeze_panes=freeze_panes,
- engine=engine)
+
+ formatter = ExcelFormatter(
+ self,
+ na_rep=na_rep,
+ cols=columns,
+ header=header,
+ float_format=float_format,
+ index=index,
+ index_label=index_label,
+ merge_cells=merge_cells,
+ inf_rep=inf_rep,
+ )
+ formatter.write(
+ excel_writer,
+ sheet_name=sheet_name,
+ startrow=startrow,
+ startcol=startcol,
+ freeze_panes=freeze_panes,
+ engine=engine,
+ )
def _translate(self):
"""
@@ -227,29 +262,43 @@ def format_attr(pair):
for r in range(n_clvls):
# Blank for Index columns...
- row_es = [{"type": "th",
- "value": BLANK_VALUE,
- "display_value": BLANK_VALUE,
- "is_visible": not hidden_index,
- "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1)
+ row_es = [
+ {
+ "type": "th",
+ "value": BLANK_VALUE,
+ "display_value": BLANK_VALUE,
+ "is_visible": not hidden_index,
+ "class": " ".join([BLANK_CLASS]),
+ }
+ ] * (n_rlvls - 1)
# ... except maybe the last for columns.names
name = self.data.columns.names[r]
- cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS,
- "level{lvl}".format(lvl=r)]
+ cs = [
+ BLANK_CLASS if name is None else INDEX_NAME_CLASS,
+ "level{lvl}".format(lvl=r),
+ ]
name = BLANK_VALUE if name is None else name
- row_es.append({"type": "th",
- "value": name,
- "display_value": name,
- "class": " ".join(cs),
- "is_visible": not hidden_index})
+ row_es.append(
+ {
+ "type": "th",
+ "value": name,
+ "display_value": name,
+ "class": " ".join(cs),
+ "is_visible": not hidden_index,
+ }
+ )
if clabels:
for c, value in enumerate(clabels[r]):
- cs = [COL_HEADING_CLASS, "level{lvl}".format(lvl=r),
- "col{col}".format(col=c)]
- cs.extend(cell_context.get(
- "col_headings", {}).get(r, {}).get(c, []))
+ cs = [
+ COL_HEADING_CLASS,
+ "level{lvl}".format(lvl=r),
+ "col{col}".format(col=c),
+ ]
+ cs.extend(
+ cell_context.get("col_headings", {}).get(r, {}).get(c, [])
+ )
es = {
"type": "th",
"value": value,
@@ -265,23 +314,24 @@ def format_attr(pair):
row_es.append(es)
head.append(row_es)
- if (self.data.index.names and
- com._any_not_none(*self.data.index.names) and
- not hidden_index):
+ if (
+ self.data.index.names
+ and com._any_not_none(*self.data.index.names)
+ and not hidden_index
+ ):
index_header_row = []
for c, name in enumerate(self.data.index.names):
- cs = [INDEX_NAME_CLASS,
- "level{lvl}".format(lvl=c)]
- name = '' if name is None else name
- index_header_row.append({"type": "th", "value": name,
- "class": " ".join(cs)})
+ cs = [INDEX_NAME_CLASS, "level{lvl}".format(lvl=c)]
+ name = "" if name is None else name
+ index_header_row.append(
+ {"type": "th", "value": name, "class": " ".join(cs)}
+ )
index_header_row.extend(
- [{"type": "th",
- "value": BLANK_VALUE,
- "class": " ".join([BLANK_CLASS])
- }] * (len(clabels[0]) - len(hidden_columns)))
+ [{"type": "th", "value": BLANK_VALUE, "class": " ".join([BLANK_CLASS])}]
+ * (len(clabels[0]) - len(hidden_columns))
+ )
head.append(index_header_row)
@@ -289,16 +339,18 @@ def format_attr(pair):
for r, idx in enumerate(self.data.index):
row_es = []
for c, value in enumerate(rlabels[r]):
- rid = [ROW_HEADING_CLASS, "level{lvl}".format(lvl=c),
- "row{row}".format(row=r)]
+ rid = [
+ ROW_HEADING_CLASS,
+ "level{lvl}".format(lvl=c),
+ "row{row}".format(row=r),
+ ]
es = {
"type": "th",
- "is_visible": (_is_visible(r, c, idx_lengths) and
- not hidden_index),
+ "is_visible": (_is_visible(r, c, idx_lengths) and not hidden_index),
"value": value,
"display_value": value,
"id": "_".join(rid[1:]),
- "class": " ".join(rid)
+ "class": " ".join(rid),
}
rowspan = idx_lengths.get((c, r), 0)
if rowspan > 1:
@@ -308,19 +360,19 @@ def format_attr(pair):
row_es.append(es)
for c, col in enumerate(self.data.columns):
- cs = [DATA_CLASS, "row{row}".format(row=r),
- "col{col}".format(col=c)]
+ cs = [DATA_CLASS, "row{row}".format(row=r), "col{col}".format(col=c)]
cs.extend(cell_context.get("data", {}).get(r, {}).get(c, []))
formatter = self._display_funcs[(r, c)]
value = self.data.iloc[r, c]
- row_dict = {"type": "td",
- "value": value,
- "class": " ".join(cs),
- "display_value": formatter(value),
- "is_visible": (c not in hidden_columns)}
+ row_dict = {
+ "type": "td",
+ "value": value,
+ "class": " ".join(cs),
+ "display_value": formatter(value),
+ "is_visible": (c not in hidden_columns),
+ }
# only add an id if the cell has a style
- if (self.cell_ids or
- not(len(ctx[r, c]) == 1 and ctx[r, c][0] == '')):
+ if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""):
row_dict["id"] = "_".join(cs[1:])
row_es.append(row_dict)
props = []
@@ -329,25 +381,34 @@ def format_attr(pair):
if x.count(":"):
props.append(x.split(":"))
else:
- props.append(['', ''])
- cellstyle.append({'props': props,
- 'selector': "row{row}_col{col}"
- .format(row=r, col=c)})
+ props.append(["", ""])
+ cellstyle.append(
+ {
+ "props": props,
+ "selector": "row{row}_col{col}".format(row=r, col=c),
+ }
+ )
body.append(row_es)
table_attr = self.table_attributes
use_mathjax = get_option("display.html.use_mathjax")
if not use_mathjax:
- table_attr = table_attr or ''
+ table_attr = table_attr or ""
if 'class="' in table_attr:
- table_attr = table_attr.replace('class="',
- 'class="tex2jax_ignore ')
+ table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ')
else:
table_attr += ' class="tex2jax_ignore"'
- return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid,
- precision=precision, table_styles=table_styles,
- caption=caption, table_attributes=table_attr)
+ return dict(
+ head=head,
+ cellstyle=cellstyle,
+ body=body,
+ uuid=uuid,
+ precision=precision,
+ table_styles=table_styles,
+ caption=caption,
+ table_attributes=table_attr,
+ )
def format(self, formatter, subset=None):
"""
@@ -460,9 +521,8 @@ def render(self, **kwargs):
# filter out empty styles, every cell will have a class
# but the list of props may just be [['', '']].
# so we have the neested anys below
- trimmed = [x for x in d['cellstyle']
- if any(any(y) for y in x['props'])]
- d['cellstyle'] = trimmed
+ trimmed = [x for x in d["cellstyle"] if any(any(y) for y in x["props"])]
+ d["cellstyle"] = trimmed
d.update(kwargs)
return self.template.render(**d)
@@ -485,9 +545,13 @@ def _update_ctx(self, attrs):
self.ctx[(i, j)].append(pair)
def _copy(self, deepcopy=False):
- styler = Styler(self.data, precision=self.precision,
- caption=self.caption, uuid=self.uuid,
- table_styles=self.table_styles)
+ styler = Styler(
+ self.data,
+ precision=self.precision,
+ caption=self.caption,
+ uuid=self.uuid,
+ table_styles=self.table_styles,
+ )
if deepcopy:
styler.ctx = copy.deepcopy(self.ctx)
styler._todo = copy.deepcopy(self._todo)
@@ -532,30 +596,34 @@ def _apply(self, func, axis=0, subset=None, **kwargs):
subset = _non_reducing_slice(subset)
data = self.data.loc[subset]
if axis is not None:
- result = data.apply(func, axis=axis,
- result_type='expand', **kwargs)
+ result = data.apply(func, axis=axis, result_type="expand", **kwargs)
result.columns = data.columns
else:
result = func(data, **kwargs)
if not isinstance(result, pd.DataFrame):
raise TypeError(
"Function {func!r} must return a DataFrame when "
- "passed to `Styler.apply` with axis=None"
- .format(func=func))
- if not (result.index.equals(data.index) and
- result.columns.equals(data.columns)):
- msg = ('Result of {func!r} must have identical index and '
- 'columns as the input'.format(func=func))
+ "passed to `Styler.apply` with axis=None".format(func=func)
+ )
+ if not (
+ result.index.equals(data.index) and result.columns.equals(data.columns)
+ ):
+ msg = (
+ "Result of {func!r} must have identical index and "
+ "columns as the input".format(func=func)
+ )
raise ValueError(msg)
result_shape = result.shape
expected_shape = self.data.loc[subset].shape
if result_shape != expected_shape:
- msg = ("Function {func!r} returned the wrong shape.\n"
- "Result has shape: {res}\n"
- "Expected shape: {expect}".format(func=func,
- res=result.shape,
- expect=expected_shape))
+ msg = (
+ "Function {func!r} returned the wrong shape.\n"
+ "Result has shape: {res}\n"
+ "Expected shape: {expect}".format(
+ func=func, res=result.shape, expect=expected_shape
+ )
+ )
raise ValueError(msg)
self._update_ctx(result)
return self
@@ -605,8 +673,9 @@ def apply(self, func, axis=0, subset=None, **kwargs):
>>> df = pd.DataFrame(np.random.randn(5, 2))
>>> df.style.apply(highlight_max)
"""
- self._todo.append((lambda instance: getattr(instance, '_apply'),
- (func, axis, subset), kwargs))
+ self._todo.append(
+ (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs)
+ )
return self
def _applymap(self, func, subset=None, **kwargs):
@@ -641,8 +710,9 @@ def applymap(self, func, subset=None, **kwargs):
--------
Styler.where
"""
- self._todo.append((lambda instance: getattr(instance, '_applymap'),
- (func, subset), kwargs))
+ self._todo.append(
+ (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs)
+ )
return self
def where(self, cond, value, other=None, subset=None, **kwargs):
@@ -677,10 +747,11 @@ def where(self, cond, value, other=None, subset=None, **kwargs):
"""
if other is None:
- other = ''
+ other = ""
- return self.applymap(lambda val: value if cond(val) else other,
- subset=subset, **kwargs)
+ return self.applymap(
+ lambda val: value if cond(val) else other, subset=subset, **kwargs
+ )
def set_precision(self, precision):
"""
@@ -858,10 +929,11 @@ def hide_columns(self, subset):
@staticmethod
def _highlight_null(v, null_color):
- return ('background-color: {color}'.format(color=null_color)
- if pd.isna(v) else '')
+ return (
+ "background-color: {color}".format(color=null_color) if pd.isna(v) else ""
+ )
- def highlight_null(self, null_color='red'):
+ def highlight_null(self, null_color="red"):
"""
Shade the background ``null_color`` for missing values.
@@ -876,8 +948,15 @@ def highlight_null(self, null_color='red'):
self.applymap(self._highlight_null, null_color=null_color)
return self
- def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0,
- subset=None, text_color_threshold=0.408):
+ def background_gradient(
+ self,
+ cmap="PuBu",
+ low=0,
+ high=0,
+ axis=0,
+ subset=None,
+ text_color_threshold=0.408,
+ ):
"""
Color the background in a gradient according to
the data in each column (optionally row).
@@ -921,19 +1000,26 @@ def background_gradient(self, cmap='PuBu', low=0, high=0, axis=0,
"""
subset = _maybe_numeric_slice(self.data, subset)
subset = _non_reducing_slice(subset)
- self.apply(self._background_gradient, cmap=cmap, subset=subset,
- axis=axis, low=low, high=high,
- text_color_threshold=text_color_threshold)
+ self.apply(
+ self._background_gradient,
+ cmap=cmap,
+ subset=subset,
+ axis=axis,
+ low=low,
+ high=high,
+ text_color_threshold=text_color_threshold,
+ )
return self
@staticmethod
- def _background_gradient(s, cmap='PuBu', low=0, high=0,
- text_color_threshold=0.408):
+ def _background_gradient(s, cmap="PuBu", low=0, high=0, text_color_threshold=0.408):
"""
Color background in a range according to the data.
"""
- if (not isinstance(text_color_threshold, (float, int)) or
- not 0 <= text_color_threshold <= 1):
+ if (
+ not isinstance(text_color_threshold, (float, int))
+ or not 0 <= text_color_threshold <= 1
+ ):
msg = "`text_color_threshold` must be a value from 0 to 1."
raise ValueError(msg)
@@ -971,8 +1057,8 @@ def relative_luminance(rgba):
def css(rgba):
dark = relative_luminance(rgba) < text_color_threshold
- text_color = '#f1f1f1' if dark else '#000000'
- return 'background-color: {b};color: {c};'.format(
+ text_color = "#f1f1f1" if dark else "#000000"
+ return "background-color: {b};color: {c};".format(
b=colors.rgb2hex(rgba), c=text_color
)
@@ -981,7 +1067,8 @@ def css(rgba):
else:
return pd.DataFrame(
[[css(rgba) for rgba in row] for row in rgbas],
- index=s.index, columns=s.columns
+ index=s.index,
+ columns=s.columns,
)
def set_properties(self, subset=None, **kwargs):
@@ -1006,8 +1093,7 @@ def set_properties(self, subset=None, **kwargs):
>>> df.style.set_properties(color="white", align="right")
>>> df.style.set_properties(**{'background-color': 'yellow'})
"""
- values = ';'.join('{p}: {v}'.format(p=p, v=v)
- for p, v in kwargs.items())
+ values = ";".join("{p}: {v}".format(p=p, v=v) for p, v in kwargs.items())
f = lambda x: values
return self.applymap(f, subset=subset)
@@ -1023,10 +1109,10 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None):
smax = s.max() if vmax is None else vmax
if isinstance(smax, ABCSeries):
smax = smax.max()
- if align == 'mid':
+ if align == "mid":
smin = min(0, smin)
smax = max(0, smax)
- elif align == 'zero':
+ elif align == "zero":
# For "zero" mode, we want the range to be symmetrical around zero.
smax = max(abs(smin), abs(smax))
smin = -smax
@@ -1038,26 +1124,26 @@ def css_bar(start, end, color):
"""
Generate CSS code to draw a bar from start to end.
"""
- css = 'width: 10em; height: 80%;'
+ css = "width: 10em; height: 80%;"
if end > start:
- css += 'background: linear-gradient(90deg,'
+ css += "background: linear-gradient(90deg,"
if start > 0:
- css += ' transparent {s:.1f}%, {c} {s:.1f}%, '.format(
+ css += " transparent {s:.1f}%, {c} {s:.1f}%, ".format(
s=start, c=color
)
- css += '{c} {e:.1f}%, transparent {e:.1f}%)'.format(
- e=min(end, width), c=color,
+ css += "{c} {e:.1f}%, transparent {e:.1f}%)".format(
+ e=min(end, width), c=color
)
return css
def css(x):
if pd.isna(x):
- return ''
+ return ""
# avoid deprecated indexing `colors[x > zero]`
color = colors[1] if x > zero else colors[0]
- if align == 'left':
+ if align == "left":
return css_bar(0, x, color)
else:
return css_bar(min(x, zero), max(x, zero), color)
@@ -1067,11 +1153,20 @@ def css(x):
else:
return pd.DataFrame(
[[css(x) for x in row] for row in normed],
- index=s.index, columns=s.columns
+ index=s.index,
+ columns=s.columns,
)
- def bar(self, subset=None, axis=0, color='#d65f5f', width=100,
- align='left', vmin=None, vmax=None):
+ def bar(
+ self,
+ subset=None,
+ axis=0,
+ color="#d65f5f",
+ width=100,
+ align="left",
+ vmin=None,
+ vmax=None,
+ ):
"""
Draw bar chart in the cell backgrounds.
@@ -1120,7 +1215,7 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100,
-------
self : Styler
"""
- if align not in ('left', 'zero', 'mid'):
+ if align not in ("left", "zero", "mid"):
raise ValueError("`align` must be one of {'left', 'zero',' mid'}")
if not (is_list_like(color)):
@@ -1128,19 +1223,28 @@ def bar(self, subset=None, axis=0, color='#d65f5f', width=100,
elif len(color) == 1:
color = [color[0], color[0]]
elif len(color) > 2:
- raise ValueError("`color` must be string or a list-like"
- " of length 2: [`color_neg`, `color_pos`]"
- " (eg: color=['#d65f5f', '#5fba7d'])")
+ raise ValueError(
+ "`color` must be string or a list-like"
+ " of length 2: [`color_neg`, `color_pos`]"
+ " (eg: color=['#d65f5f', '#5fba7d'])"
+ )
subset = _maybe_numeric_slice(self.data, subset)
subset = _non_reducing_slice(subset)
- self.apply(self._bar, subset=subset, axis=axis,
- align=align, colors=color, width=width,
- vmin=vmin, vmax=vmax)
+ self.apply(
+ self._bar,
+ subset=subset,
+ axis=axis,
+ align=align,
+ colors=color,
+ width=width,
+ vmin=vmin,
+ vmax=vmax,
+ )
return self
- def highlight_max(self, subset=None, color='yellow', axis=0):
+ def highlight_max(self, subset=None, color="yellow", axis=0):
"""
Highlight the maximum by shading the background.
@@ -1158,10 +1262,9 @@ def highlight_max(self, subset=None, color='yellow', axis=0):
-------
self : Styler
"""
- return self._highlight_handler(subset=subset, color=color, axis=axis,
- max_=True)
+ return self._highlight_handler(subset=subset, color=color, axis=axis, max_=True)
- def highlight_min(self, subset=None, color='yellow', axis=0):
+ def highlight_min(self, subset=None, color="yellow", axis=0):
"""
Highlight the minimum by shading the background.
@@ -1179,35 +1282,37 @@ def highlight_min(self, subset=None, color='yellow', axis=0):
-------
self : Styler
"""
- return self._highlight_handler(subset=subset, color=color, axis=axis,
- max_=False)
+ return self._highlight_handler(
+ subset=subset, color=color, axis=axis, max_=False
+ )
- def _highlight_handler(self, subset=None, color='yellow', axis=None,
- max_=True):
+ def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True):
subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset))
- self.apply(self._highlight_extrema, color=color, axis=axis,
- subset=subset, max_=max_)
+ self.apply(
+ self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_
+ )
return self
@staticmethod
- def _highlight_extrema(data, color='yellow', max_=True):
+ def _highlight_extrema(data, color="yellow", max_=True):
"""
Highlight the min or max in a Series or DataFrame.
"""
- attr = 'background-color: {0}'.format(color)
+ attr = "background-color: {0}".format(color)
if data.ndim == 1: # Series from .apply
if max_:
extrema = data == data.max()
else:
extrema = data == data.min()
- return [attr if v else '' for v in extrema]
+ return [attr if v else "" for v in extrema]
else: # DataFrame from .tee
if max_:
extrema = data == data.max().max()
else:
extrema = data == data.min().min()
- return pd.DataFrame(np.where(extrema, attr, ''),
- index=data.index, columns=data.columns)
+ return pd.DataFrame(
+ np.where(extrema, attr, ""), index=data.index, columns=data.columns
+ )
@classmethod
def from_custom_template(cls, searchpath, name):
@@ -1227,10 +1332,7 @@ def from_custom_template(cls, searchpath, name):
MyStyler : subclass of Styler
Has the correct ``env`` and ``template`` class attributes set.
"""
- loader = jinja2.ChoiceLoader([
- jinja2.FileSystemLoader(searchpath),
- cls.loader,
- ])
+ loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader])
class MyStyler(cls):
env = jinja2.Environment(loader=loader)
@@ -1333,27 +1435,28 @@ def _get_level_lengths(index, hidden_elements=None):
lengths = {}
if index.nlevels == 1:
for i, value in enumerate(levels):
- if(i not in hidden_elements):
+ if i not in hidden_elements:
lengths[(0, i)] = 1
return lengths
for i, lvl in enumerate(levels):
for j, row in enumerate(lvl):
- if not get_option('display.multi_sparse'):
+ if not get_option("display.multi_sparse"):
lengths[(i, j)] = 1
elif (row != sentinel) and (j not in hidden_elements):
last_label = j
lengths[(i, last_label)] = 1
- elif (row != sentinel):
+ elif row != sentinel:
# even if its hidden, keep track of it in case
# length >1 and later elements are visible
last_label = j
lengths[(i, last_label)] = 0
- elif(j not in hidden_elements):
+ elif j not in hidden_elements:
lengths[(i, last_label)] += 1
non_zero_lengths = {
- element: length for element, length in lengths.items() if length >= 1}
+ element: length for element, length in lengths.items() if length >= 1
+ }
return non_zero_lengths
@@ -1364,6 +1467,8 @@ def _maybe_wrap_formatter(formatter):
elif callable(formatter):
return formatter
else:
- msg = ("Expected a template string or callable, got {formatter} "
- "instead".format(formatter=formatter))
+ msg = (
+ "Expected a template string or callable, got {formatter} "
+ "instead".format(formatter=formatter)
+ )
raise TypeError(msg)
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index a9eff003f2249..d29078cad9318 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -9,17 +9,25 @@ def _try_import():
"pandas-gbq is required to load data from Google BigQuery. "
"See the docs: https://pandas-gbq.readthedocs.io."
)
- pandas_gbq = import_optional_dependency(
- "pandas_gbq",
- extra=msg,
- )
+ pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
return pandas_gbq
-def read_gbq(query, project_id=None, index_col=None, col_order=None,
- reauth=False, auth_local_webserver=False, dialect=None,
- location=None, configuration=None, credentials=None,
- use_bqstorage_api=None, private_key=None, verbose=None):
+def read_gbq(
+ query,
+ project_id=None,
+ index_col=None,
+ col_order=None,
+ reauth=False,
+ auth_local_webserver=False,
+ dialect=None,
+ location=None,
+ configuration=None,
+ credentials=None,
+ use_bqstorage_api=None,
+ private_key=None,
+ verbose=None,
+):
"""
Load data from Google BigQuery.
@@ -155,21 +163,48 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
# END: deprecated kwargs
return pandas_gbq.read_gbq(
- query, project_id=project_id, index_col=index_col,
- col_order=col_order, reauth=reauth,
- auth_local_webserver=auth_local_webserver, dialect=dialect,
- location=location, configuration=configuration,
- credentials=credentials, **kwargs)
+ query,
+ project_id=project_id,
+ index_col=index_col,
+ col_order=col_order,
+ reauth=reauth,
+ auth_local_webserver=auth_local_webserver,
+ dialect=dialect,
+ location=location,
+ configuration=configuration,
+ credentials=credentials,
+ **kwargs
+ )
-def to_gbq(dataframe, destination_table, project_id=None, chunksize=None,
- reauth=False, if_exists='fail', auth_local_webserver=False,
- table_schema=None, location=None, progress_bar=True,
- credentials=None, verbose=None, private_key=None):
+def to_gbq(
+ dataframe,
+ destination_table,
+ project_id=None,
+ chunksize=None,
+ reauth=False,
+ if_exists="fail",
+ auth_local_webserver=False,
+ table_schema=None,
+ location=None,
+ progress_bar=True,
+ credentials=None,
+ verbose=None,
+ private_key=None,
+):
pandas_gbq = _try_import()
- pandas_gbq.to_gbq(dataframe, destination_table, project_id=project_id,
- chunksize=chunksize, reauth=reauth, if_exists=if_exists,
- auth_local_webserver=auth_local_webserver,
- table_schema=table_schema, location=location,
- progress_bar=progress_bar, credentials=credentials,
- verbose=verbose, private_key=private_key)
+ pandas_gbq.to_gbq(
+ dataframe,
+ destination_table,
+ project_id=project_id,
+ chunksize=chunksize,
+ reauth=reauth,
+ if_exists=if_exists,
+ auth_local_webserver=auth_local_webserver,
+ table_schema=table_schema,
+ location=location,
+ progress_bar=progress_bar,
+ credentials=credentials,
+ verbose=verbose,
+ private_key=private_key,
+ )
diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py
index 862ccbb291c01..1f5e0faedc6d2 100644
--- a/pandas/io/gcs.py
+++ b/pandas/io/gcs.py
@@ -2,18 +2,17 @@
from pandas.compat._optional import import_optional_dependency
gcsfs = import_optional_dependency(
- "gcsfs",
- extra="The gcsfs library is required to handle GCS files"
+ "gcsfs", extra="The gcsfs library is required to handle GCS files"
)
-def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
- compression=None, mode=None):
+def get_filepath_or_buffer(
+ filepath_or_buffer, encoding=None, compression=None, mode=None
+):
if mode is None:
- mode = 'rb'
+ mode = "rb"
fs = gcsfs.GCSFileSystem()
- filepath_or_buffer = fs.open(
- filepath_or_buffer, mode)
+ filepath_or_buffer = fs.open(filepath_or_buffer, mode)
return filepath_or_buffer, None, compression, True
diff --git a/pandas/io/html.py b/pandas/io/html.py
index f080e1d1fc188..91f5e5a949ac3 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -35,16 +35,17 @@ def _importers():
return
global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
- bs4 = import_optional_dependency("bs4", raise_on_missing=False,
- on_version="ignore")
+ bs4 = import_optional_dependency("bs4", raise_on_missing=False, on_version="ignore")
_HAS_BS4 = bs4 is not None
- lxml = import_optional_dependency("lxml.etree", raise_on_missing=False,
- on_version="ignore")
+ lxml = import_optional_dependency(
+ "lxml.etree", raise_on_missing=False, on_version="ignore"
+ )
_HAS_LXML = lxml is not None
- html5lib = import_optional_dependency("html5lib", raise_on_missing=False,
- on_version="ignore")
+ html5lib = import_optional_dependency(
+ "html5lib", raise_on_missing=False, on_version="ignore"
+ )
_HAS_HTML5LIB = html5lib is not None
_IMPORTS = True
@@ -53,7 +54,7 @@ def _importers():
#############
# READ HTML #
#############
-_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
+_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
def _remove_whitespace(s, regex=_RE_WHITESPACE):
@@ -72,7 +73,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE):
subd : str or unicode
`s` with all extra whitespace replaced with a single space.
"""
- return regex.sub(' ', s.strip())
+ return regex.sub(" ", s.strip())
def _get_skiprows(skiprows):
@@ -100,8 +101,9 @@ def _get_skiprows(skiprows):
return skiprows
elif skiprows is None:
return 0
- raise TypeError('%r is not a valid type for skipping rows' %
- type(skiprows).__name__)
+ raise TypeError(
+ "%r is not a valid type for skipping rows" % type(skiprows).__name__
+ )
def _read(obj):
@@ -118,13 +120,13 @@ def _read(obj):
if _is_url(obj):
with urlopen(obj) as url:
text = url.read()
- elif hasattr(obj, 'read'):
+ elif hasattr(obj, "read"):
text = obj.read()
elif isinstance(obj, (str, bytes)):
text = obj
try:
if os.path.isfile(text):
- with open(text, 'rb') as f:
+ with open(text, "rb") as f:
return f.read()
except (TypeError, ValueError):
pass
@@ -397,8 +399,7 @@ def _parse_thead_tbody_tfoot(self, table_html):
footer_rows = self._parse_tfoot_tr(table_html)
def row_is_all_th(row):
- return all(self._equals_tag(t, 'th') for t in
- self._parse_td(row))
+ return all(self._equals_tag(t, "th") for t in self._parse_td(row))
if not header_rows:
# The table has no <thead>. Move the top all-<th> rows from
@@ -449,14 +450,13 @@ def _expand_colspan_rowspan(self, rows):
prev_i, prev_text, prev_rowspan = remainder.pop(0)
texts.append(prev_text)
if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text,
- prev_rowspan - 1))
+ next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
index += 1
# Append the text from this <td>, colspan times
text = _remove_whitespace(self._text_getter(td))
- rowspan = int(self._attr_getter(td, 'rowspan') or 1)
- colspan = int(self._attr_getter(td, 'colspan') or 1)
+ rowspan = int(self._attr_getter(td, "rowspan") or 1)
+ colspan = int(self._attr_getter(td, "colspan") or 1)
for _ in range(colspan):
texts.append(text)
@@ -468,8 +468,7 @@ def _expand_colspan_rowspan(self, rows):
for prev_i, prev_text, prev_rowspan in remainder:
texts.append(prev_text)
if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text,
- prev_rowspan - 1))
+ next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
all_texts.append(texts)
remainder = next_remainder
@@ -482,8 +481,7 @@ def _expand_colspan_rowspan(self, rows):
for prev_i, prev_text, prev_rowspan in remainder:
texts.append(prev_text)
if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text,
- prev_rowspan - 1))
+ next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
all_texts.append(texts)
remainder = next_remainder
@@ -508,8 +506,12 @@ def _handle_hidden_tables(self, tbl_list, attr_name):
if not self.displayed_only:
return tbl_list
- return [x for x in tbl_list if "display:none" not in
- getattr(x, attr_name).get('style', '').replace(" ", "")]
+ return [
+ x
+ for x in tbl_list
+ if "display:none"
+ not in getattr(x, attr_name).get("style", "").replace(" ", "")
+ ]
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
@@ -529,14 +531,15 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
from bs4 import SoupStrainer
- self._strainer = SoupStrainer('table')
+
+ self._strainer = SoupStrainer("table")
def _parse_tables(self, doc, match, attrs):
element_name = self._strainer.name
tables = doc.find_all(element_name, attrs=attrs)
if not tables:
- raise ValueError('No tables found')
+ raise ValueError("No tables found")
result = []
unique_tables = set()
@@ -544,18 +547,17 @@ def _parse_tables(self, doc, match, attrs):
for table in tables:
if self.displayed_only:
- for elem in table.find_all(
- style=re.compile(r"display:\s*none")):
+ for elem in table.find_all(style=re.compile(r"display:\s*none")):
elem.decompose()
- if (table not in unique_tables and
- table.find(text=match) is not None):
+ if table not in unique_tables and table.find(text=match) is not None:
result.append(table)
unique_tables.add(table)
if not result:
- raise ValueError("No tables found matching pattern {patt!r}"
- .format(patt=match.pattern))
+ raise ValueError(
+ "No tables found matching pattern {patt!r}".format(patt=match.pattern)
+ )
return result
def _text_getter(self, obj):
@@ -565,31 +567,32 @@ def _equals_tag(self, obj, tag):
return obj.name == tag
def _parse_td(self, row):
- return row.find_all(('td', 'th'), recursive=False)
+ return row.find_all(("td", "th"), recursive=False)
def _parse_thead_tr(self, table):
- return table.select('thead tr')
+ return table.select("thead tr")
def _parse_tbody_tr(self, table):
- from_tbody = table.select('tbody tr')
- from_root = table.find_all('tr', recursive=False)
+ from_tbody = table.select("tbody tr")
+ from_root = table.find_all("tr", recursive=False)
# HTML spec: at most one of these lists has content
return from_tbody + from_root
def _parse_tfoot_tr(self, table):
- return table.select('tfoot tr')
+ return table.select("tfoot tr")
def _setup_build_doc(self):
raw_text = _read(self.io)
if not raw_text:
- raise ValueError('No text parsed from document: {doc}'
- .format(doc=self.io))
+ raise ValueError("No text parsed from document: {doc}".format(doc=self.io))
return raw_text
def _build_doc(self):
from bs4 import BeautifulSoup
- return BeautifulSoup(self._setup_build_doc(), features='html5lib',
- from_encoding=self.encoding)
+
+ return BeautifulSoup(
+ self._setup_build_doc(), features="html5lib", from_encoding=self.encoding
+ )
def _build_xpath_expr(attrs):
@@ -607,15 +610,15 @@ def _build_xpath_expr(attrs):
An XPath expression that checks for the given HTML attributes.
"""
# give class attribute as class_ because class is a python keyword
- if 'class_' in attrs:
- attrs['class'] = attrs.pop('class_')
+ if "class_" in attrs:
+ attrs["class"] = attrs.pop("class_")
s = ["@{key}={val!r}".format(key=k, val=v) for k, v in attrs.items()]
- return '[{expr}]'.format(expr=' and '.join(s))
+ return "[{expr}]".format(expr=" and ".join(s))
-_re_namespace = {'re': 'http://exslt.org/regular-expressions'}
-_valid_schemes = 'http', 'file', 'ftp'
+_re_namespace = {"re": "http://exslt.org/regular-expressions"}
+_valid_schemes = "http", "file", "ftp"
class _LxmlFrameParser(_HtmlFrameParser):
@@ -645,14 +648,14 @@ def _text_getter(self, obj):
def _parse_td(self, row):
# Look for direct children only: the "row" element here may be a
# <thead> or <tfoot> (see _parse_thead_tr).
- return row.xpath('./td|./th')
+ return row.xpath("./td|./th")
def _parse_tables(self, doc, match, kwargs):
pattern = match.pattern
# 1. check all descendants for the given pattern and only search tables
# 2. go up the tree until we find a table
- query = '//table//*[re:test(text(), {patt!r})]/ancestor::table'
+ query = "//table//*[re:test(text(), {patt!r})]/ancestor::table"
xpath_expr = query.format(patt=pattern)
# if any table attributes were given build an xpath expression to
@@ -668,14 +671,14 @@ def _parse_tables(self, doc, match, kwargs):
# lxml utilizes XPATH 1.0 which does not have regex
# support. As a result, we find all elements with a style
# attribute and iterate them to check for display:none
- for elem in table.xpath('.//*[@style]'):
- if "display:none" in elem.attrib.get(
- "style", "").replace(" ", ""):
+ for elem in table.xpath(".//*[@style]"):
+ if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
elem.getparent().remove(elem)
if not tables:
- raise ValueError("No tables found matching regex {patt!r}"
- .format(patt=pattern))
+ raise ValueError(
+ "No tables found matching regex {patt!r}".format(patt=pattern)
+ )
return tables
def _equals_tag(self, obj, tag):
@@ -699,6 +702,7 @@ def _build_doc(self):
"""
from lxml.html import parse, fromstring, HTMLParser
from lxml.etree import XMLSyntaxError
+
parser = HTMLParser(recover=True, encoding=self.encoding)
try:
@@ -724,15 +728,15 @@ def _build_doc(self):
else:
raise e
else:
- if not hasattr(r, 'text_content'):
+ if not hasattr(r, "text_content"):
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
return r
def _parse_thead_tr(self, table):
rows = []
- for thead in table.xpath('.//thead'):
- rows.extend(thead.xpath('./tr'))
+ for thead in table.xpath(".//thead"):
+ rows.extend(thead.xpath("./tr"))
# HACK: lxml does not clean up the clearly-erroneous
# <thead><td>foo</td><td>bar</td></thead>. (Missing </tr>). Add
@@ -740,20 +744,20 @@ def _parse_thead_tr(self, table):
# the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
# children as though it's a <tr>.
#
# Better solution would be to use html5lib.
- elements_at_root = thead.xpath('./td|./th')
+ elements_at_root = thead.xpath("./td|./th")
if elements_at_root:
rows.append(thead)
return rows
def _parse_tbody_tr(self, table):
- from_tbody = table.xpath('.//tbody//tr')
- from_root = table.xpath('./tr')
+ from_tbody = table.xpath(".//tbody//tr")
+ from_root = table.xpath("./tr")
# HTML spec: at most one of these lists has content
return from_tbody + from_root
def _parse_tfoot_tr(self, table):
- return table.xpath('.//tfoot//tr')
+ return table.xpath(".//tfoot//tr")
def _expand_elements(body):
@@ -761,15 +765,15 @@ def _expand_elements(body):
lens_max = lens.max()
not_max = lens[lens != lens_max]
- empty = ['']
+ empty = [""]
for ind, length in not_max.items():
body[ind] += empty * (lens_max - length)
def _data_to_frame(**kwargs):
- head, body, foot = kwargs.pop('data')
- header = kwargs.pop('header')
- kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
+ head, body, foot = kwargs.pop("data")
+ header = kwargs.pop("header")
+ kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
if head:
body = head + body
@@ -779,8 +783,7 @@ def _data_to_frame(**kwargs):
header = 0
else:
# ignore all-empty-text rows
- header = [i for i, row in enumerate(head)
- if any(text for text in row)]
+ header = [i for i, row in enumerate(head) if any(text for text in row)]
if foot:
body += foot
@@ -792,9 +795,12 @@ def _data_to_frame(**kwargs):
return df
-_valid_parsers = {'lxml': _LxmlFrameParser, None: _LxmlFrameParser,
- 'html5lib': _BeautifulSoupHtml5LibFrameParser,
- 'bs4': _BeautifulSoupHtml5LibFrameParser}
+_valid_parsers = {
+ "lxml": _LxmlFrameParser,
+ None: _LxmlFrameParser,
+ "html5lib": _BeautifulSoupHtml5LibFrameParser,
+ "bs4": _BeautifulSoupHtml5LibFrameParser,
+}
def _parser_dispatch(flavor):
@@ -819,18 +825,18 @@ def _parser_dispatch(flavor):
"""
valid_parsers = list(_valid_parsers.keys())
if flavor not in valid_parsers:
- raise ValueError('{invalid!r} is not a valid flavor, valid flavors '
- 'are {valid}'
- .format(invalid=flavor, valid=valid_parsers))
+ raise ValueError(
+ "{invalid!r} is not a valid flavor, valid flavors "
+ "are {valid}".format(invalid=flavor, valid=valid_parsers)
+ )
- if flavor in ('bs4', 'html5lib'):
+ if flavor in ("bs4", "html5lib"):
if not _HAS_HTML5LIB:
raise ImportError("html5lib not found, please install it")
if not _HAS_BS4:
- raise ImportError(
- "BeautifulSoup4 (bs4) not found, please install it")
+ raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
# Although we call this above, we want to raise here right before use.
- bs4 = import_optional_dependency('bs4') # noqa:F841
+ bs4 = import_optional_dependency("bs4") # noqa:F841
else:
if not _HAS_LXML:
@@ -839,23 +845,23 @@ def _parser_dispatch(flavor):
def _print_as_set(s):
- return ('{' + '{arg}'.format(arg=', '.join(
- pprint_thing(el) for el in s)) + '}')
+ return "{" + "{arg}".format(arg=", ".join(pprint_thing(el) for el in s)) + "}"
def _validate_flavor(flavor):
if flavor is None:
- flavor = 'lxml', 'bs4'
+ flavor = "lxml", "bs4"
elif isinstance(flavor, str):
- flavor = flavor,
+ flavor = (flavor,)
elif isinstance(flavor, abc.Iterable):
if not all(isinstance(flav, str) for flav in flavor):
- raise TypeError('Object of type {typ!r} is not an iterable of '
- 'strings'
- .format(typ=type(flavor).__name__))
+ raise TypeError(
+ "Object of type {typ!r} is not an iterable of "
+ "strings".format(typ=type(flavor).__name__)
+ )
else:
- fmt = '{flavor!r}' if isinstance(flavor, str) else '{flavor}'
- fmt += ' is not a valid flavor'
+ fmt = "{flavor!r}" if isinstance(flavor, str) else "{flavor}"
+ fmt += " is not a valid flavor"
raise ValueError(fmt.format(flavor=flavor))
flavor = tuple(flavor)
@@ -863,10 +869,12 @@ def _validate_flavor(flavor):
flavor_set = set(flavor)
if not flavor_set & valid_flavors:
- raise ValueError('{invalid} is not a valid set of flavors, valid '
- 'flavors are {valid}'
- .format(invalid=_print_as_set(flavor_set),
- valid=_print_as_set(valid_flavors)))
+ raise ValueError(
+ "{invalid} is not a valid set of flavors, valid "
+ "flavors are {valid}".format(
+ invalid=_print_as_set(flavor_set), valid=_print_as_set(valid_flavors)
+ )
+ )
return flavor
@@ -885,15 +893,17 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
except Exception as caught:
# if `io` is an io-like object, check if it's seekable
# and try to rewind it before trying the next parser
- if hasattr(io, 'seekable') and io.seekable():
+ if hasattr(io, "seekable") and io.seekable():
io.seek(0)
- elif hasattr(io, 'seekable') and not io.seekable():
+ elif hasattr(io, "seekable") and not io.seekable():
# if we couldn't rewind it, let the user know
- raise ValueError('The flavor {} failed to parse your input. '
- 'Since you passed a non-rewindable file '
- 'object, we can\'t rewind it to try '
- 'another parser. Try read_html() with a '
- 'different flavor.'.format(flav))
+ raise ValueError(
+ "The flavor {} failed to parse your input. "
+ "Since you passed a non-rewindable file "
+ "object, we can't rewind it to try "
+ "another parser. Try read_html() with a "
+ "different flavor.".format(flav)
+ )
retained = caught
else:
@@ -910,11 +920,23 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
return ret
-def read_html(io, match='.+', flavor=None, header=None, index_col=None,
- skiprows=None, attrs=None, parse_dates=False,
- thousands=',', encoding=None,
- decimal='.', converters=None, na_values=None,
- keep_default_na=True, displayed_only=True):
+def read_html(
+ io,
+ match=".+",
+ flavor=None,
+ header=None,
+ index_col=None,
+ skiprows=None,
+ attrs=None,
+ parse_dates=False,
+ thousands=",",
+ encoding=None,
+ decimal=".",
+ converters=None,
+ na_values=None,
+ keep_default_na=True,
+ displayed_only=True,
+):
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
Parameters
@@ -1060,13 +1082,25 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
# Type check here. We don't want to parse only to fail because of an
# invalid value of an integer skiprows.
if isinstance(skiprows, numbers.Integral) and skiprows < 0:
- raise ValueError('cannot skip rows starting from the end of the '
- 'data (you passed a negative value)')
+ raise ValueError(
+ "cannot skip rows starting from the end of the "
+ "data (you passed a negative value)"
+ )
_validate_header_arg(header)
- return _parse(flavor=flavor, io=io, match=match, header=header,
- index_col=index_col, skiprows=skiprows,
- parse_dates=parse_dates,
- thousands=thousands, attrs=attrs, encoding=encoding,
- decimal=decimal, converters=converters, na_values=na_values,
- keep_default_na=keep_default_na,
- displayed_only=displayed_only)
+ return _parse(
+ flavor=flavor,
+ io=io,
+ match=match,
+ header=header,
+ index_col=index_col,
+ skiprows=skiprows,
+ parse_dates=parse_dates,
+ thousands=thousands,
+ attrs=attrs,
+ encoding=encoding,
+ decimal=decimal,
+ converters=converters,
+ na_values=na_values,
+ keep_default_na=keep_default_na,
+ displayed_only=displayed_only,
+ )
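For reference, a minimal usage sketch of the read_html() keywords reformatted above; it assumes lxml (or bs4 plus html5lib) is installed, and the table contents are illustrative only:

    import pandas as pd

    html = """
    <table>
      <thead><tr><th>name</th><th>score</th></tr></thead>
      <tbody>
        <tr><td>alice</td><td>1.5</td></tr>
        <tr><td>bob</td><td>2.0</td></tr>
      </tbody>
    </table>
    """

    # read_html returns a list of DataFrames, one per <table> whose text
    # matches the `match` regex; `flavor` selects the parser backend.
    tables = pd.read_html(html, match="name", header=0, flavor="lxml")
    df = tables[0]  # columns: name, score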
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index f14b615471ccc..f3f0f417acaab 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -14,8 +14,12 @@
from pandas.core.reshape.concat import concat
from pandas.io.common import (
- BaseIterator, _get_handle, _infer_compression, _stringify_path,
- get_filepath_or_buffer)
+ BaseIterator,
+ _get_handle,
+ _infer_compression,
+ _stringify_path,
+ get_filepath_or_buffer,
+)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import _validate_integer
@@ -25,27 +29,36 @@
loads = json.loads
dumps = json.dumps
-TABLE_SCHEMA_VERSION = '0.20.0'
+TABLE_SCHEMA_VERSION = "0.20.0"
# interface to/from
-def to_json(path_or_buf, obj, orient=None, date_format='epoch',
- double_precision=10, force_ascii=True, date_unit='ms',
- default_handler=None, lines=False, compression='infer',
- index=True):
-
- if not index and orient not in ['split', 'table']:
- raise ValueError("'index=False' is only valid when 'orient' is "
- "'split' or 'table'")
+def to_json(
+ path_or_buf,
+ obj,
+ orient=None,
+ date_format="epoch",
+ double_precision=10,
+ force_ascii=True,
+ date_unit="ms",
+ default_handler=None,
+ lines=False,
+ compression="infer",
+ index=True,
+):
+
+ if not index and orient not in ["split", "table"]:
+ raise ValueError(
+ "'index=False' is only valid when 'orient' is " "'split' or 'table'"
+ )
path_or_buf = _stringify_path(path_or_buf)
- if lines and orient != 'records':
- raise ValueError(
- "'lines' keyword only valid when 'orient' is records")
+ if lines and orient != "records":
+ raise ValueError("'lines' keyword only valid when 'orient' is records")
- if orient == 'table' and isinstance(obj, Series):
- obj = obj.to_frame(name=obj.name or 'values')
- if orient == 'table' and isinstance(obj, DataFrame):
+ if orient == "table" and isinstance(obj, Series):
+ obj = obj.to_frame(name=obj.name or "values")
+ if orient == "table" and isinstance(obj, DataFrame):
writer = JSONTableWriter
elif isinstance(obj, Series):
writer = SeriesWriter
@@ -55,16 +68,21 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
raise NotImplementedError("'obj' should be a Series or a DataFrame")
s = writer(
- obj, orient=orient, date_format=date_format,
- double_precision=double_precision, ensure_ascii=force_ascii,
- date_unit=date_unit, default_handler=default_handler,
- index=index).write()
+ obj,
+ orient=orient,
+ date_format=date_format,
+ double_precision=double_precision,
+ ensure_ascii=force_ascii,
+ date_unit=date_unit,
+ default_handler=default_handler,
+ index=index,
+ ).write()
if lines:
s = _convert_to_line_delimits(s)
if isinstance(path_or_buf, str):
- fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
+ fh, handles = _get_handle(path_or_buf, "w", compression=compression)
try:
fh.write(s)
finally:
@@ -76,8 +94,17 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
class Writer:
- def __init__(self, obj, orient, date_format, double_precision,
- ensure_ascii, date_unit, index, default_handler=None):
+ def __init__(
+ self,
+ obj,
+ orient,
+ date_format,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ index,
+ default_handler=None,
+ ):
self.obj = obj
if orient is None:
@@ -98,12 +125,26 @@ def _format_axes(self):
raise AbstractMethodError(self)
def write(self):
- return self._write(self.obj, self.orient, self.double_precision,
- self.ensure_ascii, self.date_unit,
- self.date_format == 'iso', self.default_handler)
+ return self._write(
+ self.obj,
+ self.orient,
+ self.double_precision,
+ self.ensure_ascii,
+ self.date_unit,
+ self.date_format == "iso",
+ self.default_handler,
+ )
- def _write(self, obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler):
+ def _write(
+ self,
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ ):
return dumps(
obj,
orient=orient,
@@ -111,91 +152,147 @@ def _write(self, obj, orient, double_precision, ensure_ascii,
ensure_ascii=ensure_ascii,
date_unit=date_unit,
iso_dates=iso_dates,
- default_handler=default_handler
+ default_handler=default_handler,
)
class SeriesWriter(Writer):
- _default_orient = 'index'
+ _default_orient = "index"
def _format_axes(self):
- if not self.obj.index.is_unique and self.orient == 'index':
- raise ValueError("Series index must be unique for orient="
- "'{orient}'".format(orient=self.orient))
+ if not self.obj.index.is_unique and self.orient == "index":
+ raise ValueError(
+ "Series index must be unique for orient="
+ "'{orient}'".format(orient=self.orient)
+ )
- def _write(self, obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler):
- if not self.index and orient == 'split':
+ def _write(
+ self,
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ ):
+ if not self.index and orient == "split":
obj = {"name": obj.name, "data": obj.values}
- return super()._write(obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler)
+ return super()._write(
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ )
class FrameWriter(Writer):
- _default_orient = 'columns'
+ _default_orient = "columns"
def _format_axes(self):
"""
Try to format axes if they are datelike.
"""
- if not self.obj.index.is_unique and self.orient in (
- 'index', 'columns'):
- raise ValueError("DataFrame index must be unique for orient="
- "'{orient}'.".format(orient=self.orient))
+ if not self.obj.index.is_unique and self.orient in ("index", "columns"):
+ raise ValueError(
+ "DataFrame index must be unique for orient="
+ "'{orient}'.".format(orient=self.orient)
+ )
if not self.obj.columns.is_unique and self.orient in (
- 'index', 'columns', 'records'):
- raise ValueError("DataFrame columns must be unique for orient="
- "'{orient}'.".format(orient=self.orient))
-
- def _write(self, obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler):
- if not self.index and orient == 'split':
- obj = obj.to_dict(orient='split')
+ "index",
+ "columns",
+ "records",
+ ):
+ raise ValueError(
+ "DataFrame columns must be unique for orient="
+ "'{orient}'.".format(orient=self.orient)
+ )
+
+ def _write(
+ self,
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ ):
+ if not self.index and orient == "split":
+ obj = obj.to_dict(orient="split")
del obj["index"]
- return super()._write(obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler)
+ return super()._write(
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ )
class JSONTableWriter(FrameWriter):
- _default_orient = 'records'
-
- def __init__(self, obj, orient, date_format, double_precision,
- ensure_ascii, date_unit, index, default_handler=None):
+ _default_orient = "records"
+
+ def __init__(
+ self,
+ obj,
+ orient,
+ date_format,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ index,
+ default_handler=None,
+ ):
"""
Adds a `schema` attribute with the Table Schema, resets
the index (can't do in caller, because the schema inference needs
to know what the index is, forces orient to records, and forces
date_format to 'iso'.
"""
- super().__init__(obj, orient, date_format, double_precision,
- ensure_ascii, date_unit, index,
- default_handler=default_handler)
-
- if date_format != 'iso':
- msg = ("Trying to write with `orient='table'` and "
- "`date_format='{fmt}'`. Table Schema requires dates "
- "to be formatted with `date_format='iso'`"
- .format(fmt=date_format))
+ super().__init__(
+ obj,
+ orient,
+ date_format,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ index,
+ default_handler=default_handler,
+ )
+
+ if date_format != "iso":
+ msg = (
+ "Trying to write with `orient='table'` and "
+ "`date_format='{fmt}'`. Table Schema requires dates "
+ "to be formatted with `date_format='iso'`".format(fmt=date_format)
+ )
raise ValueError(msg)
self.schema = build_table_schema(obj, index=self.index)
# NotImplemented on a column MultiIndex
if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
- raise NotImplementedError(
- "orient='table' is not supported for MultiIndex")
+ raise NotImplementedError("orient='table' is not supported for MultiIndex")
# TODO: Do this timedelta properly in objToJSON.c See GH #15137
- if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
- len(obj.columns & obj.index.names)):
+ if (
+ (obj.ndim == 1)
+ and (obj.name in set(obj.index.names))
+ or len(obj.columns & obj.index.names)
+ ):
msg = "Overlapping names between the index and columns"
raise ValueError(msg)
obj = obj.copy()
- timedeltas = obj.select_dtypes(include=['timedelta']).columns
+ timedeltas = obj.select_dtypes(include=["timedelta"]).columns
if len(timedeltas):
- obj[timedeltas] = obj[timedeltas].applymap(
- lambda x: x.isoformat())
+ obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
# Convert PeriodIndex to datetimes before serialzing
if is_period_dtype(obj.index):
obj.index = obj.index.to_timestamp()
@@ -205,23 +302,51 @@ def __init__(self, obj, orient, date_format, double_precision,
self.obj = obj.reset_index(drop=True)
else:
self.obj = obj.reset_index(drop=False)
- self.date_format = 'iso'
- self.orient = 'records'
+ self.date_format = "iso"
+ self.orient = "records"
self.index = index
- def _write(self, obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler):
- data = super()._write(obj, orient, double_precision, ensure_ascii,
- date_unit, iso_dates, default_handler)
+ def _write(
+ self,
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ ):
+ data = super()._write(
+ obj,
+ orient,
+ double_precision,
+ ensure_ascii,
+ date_unit,
+ iso_dates,
+ default_handler,
+ )
serialized = '{{"schema": {schema}, "data": {data}}}'.format(
- schema=dumps(self.schema), data=data)
+ schema=dumps(self.schema), data=data
+ )
return serialized
-def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None,
- convert_axes=None, convert_dates=True, keep_default_dates=True,
- numpy=False, precise_float=False, date_unit=None, encoding=None,
- lines=False, chunksize=None, compression='infer'):
+def read_json(
+ path_or_buf=None,
+ orient=None,
+ typ="frame",
+ dtype=None,
+ convert_axes=None,
+ convert_dates=True,
+ keep_default_dates=True,
+ numpy=False,
+ precise_float=False,
+ date_unit=None,
+ encoding=None,
+ lines=False,
+ chunksize=None,
+ compression="infer",
+):
"""
Convert a JSON string to pandas object.
@@ -414,27 +539,36 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None,
{"index": "row 2", "col 1": "c", "col 2": "d"}]}'
"""
- if orient == 'table' and dtype:
+ if orient == "table" and dtype:
raise ValueError("cannot pass both dtype and orient='table'")
- if orient == 'table' and convert_axes:
+ if orient == "table" and convert_axes:
raise ValueError("cannot pass both convert_axes and orient='table'")
- if dtype is None and orient != 'table':
+ if dtype is None and orient != "table":
dtype = True
- if convert_axes is None and orient != 'table':
+ if convert_axes is None and orient != "table":
convert_axes = True
compression = _infer_compression(path_or_buf, compression)
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
- path_or_buf, encoding=encoding, compression=compression,
+ path_or_buf, encoding=encoding, compression=compression
)
json_reader = JsonReader(
- filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
- convert_axes=convert_axes, convert_dates=convert_dates,
- keep_default_dates=keep_default_dates, numpy=numpy,
- precise_float=precise_float, date_unit=date_unit, encoding=encoding,
- lines=lines, chunksize=chunksize, compression=compression,
+ filepath_or_buffer,
+ orient=orient,
+ typ=typ,
+ dtype=dtype,
+ convert_axes=convert_axes,
+ convert_dates=convert_dates,
+ keep_default_dates=keep_default_dates,
+ numpy=numpy,
+ precise_float=precise_float,
+ date_unit=date_unit,
+ encoding=encoding,
+ lines=lines,
+ chunksize=chunksize,
+ compression=compression,
)
if chunksize:
@@ -457,9 +591,24 @@ class JsonReader(BaseIterator):
``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
whole document.
"""
- def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
- convert_dates, keep_default_dates, numpy, precise_float,
- date_unit, encoding, lines, chunksize, compression):
+
+ def __init__(
+ self,
+ filepath_or_buffer,
+ orient,
+ typ,
+ dtype,
+ convert_axes,
+ convert_dates,
+ keep_default_dates,
+ numpy,
+ precise_float,
+ date_unit,
+ encoding,
+ lines,
+ chunksize,
+ compression,
+ ):
self.path_or_buf = filepath_or_buffer
self.orient = orient
@@ -494,9 +643,9 @@ def _preprocess_data(self, data):
If self.chunksize, we prepare the data for the `__next__` method.
Otherwise, we read it into memory for the `read` method.
"""
- if hasattr(data, 'read') and not self.chunksize:
+ if hasattr(data, "read") and not self.chunksize:
data = data.read()
- if not hasattr(data, 'read') and self.chunksize:
+ if not hasattr(data, "read") and self.chunksize:
data = StringIO(data)
return data
@@ -522,9 +671,12 @@ def _get_data_from_filepath(self, filepath_or_buffer):
pass
if exists or self.compression is not None:
- data, _ = _get_handle(filepath_or_buffer, 'r',
- encoding=self.encoding,
- compression=self.compression)
+ data, _ = _get_handle(
+ filepath_or_buffer,
+ "r",
+ encoding=self.encoding,
+ compression=self.compression,
+ )
self.should_close = True
self.open_stream = data
@@ -535,7 +687,7 @@ def _combine_lines(self, lines):
Combines a list of JSON objects into one JSON object.
"""
lines = filter(None, map(lambda x: x.strip(), lines))
- return '[' + ','.join(lines) + ']'
+ return "[" + ",".join(lines) + "]"
def read(self):
"""
@@ -545,9 +697,7 @@ def read(self):
obj = concat(self)
elif self.lines:
data = ensure_str(self.data)
- obj = self._get_object_parser(
- self._combine_lines(data.split('\n'))
- )
+ obj = self._get_object_parser(self._combine_lines(data.split("\n")))
else:
obj = self._get_object_parser(self.data)
self.close()
@@ -560,19 +710,22 @@ def _get_object_parser(self, json):
typ = self.typ
dtype = self.dtype
kwargs = {
- "orient": self.orient, "dtype": self.dtype,
+ "orient": self.orient,
+ "dtype": self.dtype,
"convert_axes": self.convert_axes,
"convert_dates": self.convert_dates,
- "keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
- "precise_float": self.precise_float, "date_unit": self.date_unit
+ "keep_default_dates": self.keep_default_dates,
+ "numpy": self.numpy,
+ "precise_float": self.precise_float,
+ "date_unit": self.date_unit,
}
obj = None
- if typ == 'frame':
+ if typ == "frame":
obj = FrameParser(json, **kwargs).parse()
- if typ == 'series' or obj is None:
+ if typ == "series" or obj is None:
if not isinstance(dtype, bool):
- kwargs['dtype'] = dtype
+ kwargs["dtype"] = dtype
obj = SeriesParser(json, **kwargs).parse()
return obj
@@ -608,16 +761,26 @@ def __next__(self):
class Parser:
- _STAMP_UNITS = ('s', 'ms', 'us', 'ns')
+ _STAMP_UNITS = ("s", "ms", "us", "ns")
_MIN_STAMPS = {
- 's': 31536000,
- 'ms': 31536000000,
- 'us': 31536000000000,
- 'ns': 31536000000000000}
-
- def __init__(self, json, orient, dtype=None, convert_axes=True,
- convert_dates=True, keep_default_dates=False, numpy=False,
- precise_float=False, date_unit=None):
+ "s": 31536000,
+ "ms": 31536000000,
+ "us": 31536000000000,
+ "ns": 31536000000000000,
+ }
+
+ def __init__(
+ self,
+ json,
+ orient,
+ dtype=None,
+ convert_axes=True,
+ convert_dates=True,
+ keep_default_dates=False,
+ numpy=False,
+ precise_float=False,
+ date_unit=None,
+ ):
self.json = json
if orient is None:
@@ -632,11 +795,12 @@ def __init__(self, json, orient, dtype=None, convert_axes=True,
if date_unit is not None:
date_unit = date_unit.lower()
if date_unit not in self._STAMP_UNITS:
- raise ValueError('date_unit must be one of {units}'
- .format(units=self._STAMP_UNITS))
+ raise ValueError(
+ "date_unit must be one of {units}".format(units=self._STAMP_UNITS)
+ )
self.min_stamp = self._MIN_STAMPS[date_unit]
else:
- self.min_stamp = self._MIN_STAMPS['s']
+ self.min_stamp = self._MIN_STAMPS["s"]
self.numpy = numpy
self.precise_float = precise_float
@@ -653,8 +817,11 @@ def check_keys_split(self, decoded):
bad_keys = set(decoded.keys()).difference(set(self._split_keys))
if bad_keys:
bad_keys = ", ".join(bad_keys)
- raise ValueError("JSON data had unexpected key(s): {bad_keys}"
- .format(bad_keys=pprint_thing(bad_keys)))
+ raise ValueError(
+ "JSON data had unexpected key(s): {bad_keys}".format(
+ bad_keys=pprint_thing(bad_keys)
+ )
+ )
def parse(self):
@@ -679,16 +846,15 @@ def _convert_axes(self):
"""
for axis in self.obj._AXIS_NUMBERS.keys():
new_axis, result = self._try_convert_data(
- axis, self.obj._get_axis(axis), use_dtypes=False,
- convert_dates=True)
+ axis, self.obj._get_axis(axis), use_dtypes=False, convert_dates=True
+ )
if result:
setattr(self.obj, axis, new_axis)
def _try_convert_types(self):
raise AbstractMethodError(self)
- def _try_convert_data(self, name, data, use_dtypes=True,
- convert_dates=True):
+ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
"""
Try to parse a ndarray like into a column by inferring dtype.
"""
@@ -701,8 +867,9 @@ def _try_convert_data(self, name, data, use_dtypes=True,
pass
else:
# dtype to force
- dtype = (self.dtype.get(name)
- if isinstance(self.dtype, dict) else self.dtype)
+ dtype = (
+ self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
+ )
if dtype is not None:
try:
dtype = np.dtype(dtype)
@@ -717,32 +884,32 @@ def _try_convert_data(self, name, data, use_dtypes=True,
result = False
- if data.dtype == 'object':
+ if data.dtype == "object":
# try float
try:
- data = data.astype('float64')
+ data = data.astype("float64")
result = True
except (TypeError, ValueError):
pass
- if data.dtype.kind == 'f':
+ if data.dtype.kind == "f":
- if data.dtype != 'float64':
+ if data.dtype != "float64":
# coerce floats to 64
try:
- data = data.astype('float64')
+ data = data.astype("float64")
result = True
except (TypeError, ValueError):
pass
# don't coerce 0-len data
- if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
+ if len(data) and (data.dtype == "float" or data.dtype == "object"):
# coerce ints if we can
try:
- new_data = data.astype('int64')
+ new_data = data.astype("int64")
if (new_data == data).all():
data = new_data
result = True
@@ -750,11 +917,11 @@ def _try_convert_data(self, name, data, use_dtypes=True,
pass
# coerce ints to 64
- if data.dtype == 'int':
+ if data.dtype == "int":
# coerce floats to 64
try:
- data = data.astype('int64')
+ data = data.astype("int64")
result = True
except (TypeError, ValueError):
pass
@@ -774,24 +941,26 @@ def _try_convert_to_date(self, data):
return data, False
new_data = data
- if new_data.dtype == 'object':
+ if new_data.dtype == "object":
try:
- new_data = data.astype('int64')
+ new_data = data.astype("int64")
except (TypeError, ValueError, OverflowError):
pass
# ignore numbers that are out of range
if issubclass(new_data.dtype.type, np.number):
- in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
- (new_data.values == iNaT))
+ in_range = (
+ isna(new_data.values)
+ | (new_data > self.min_stamp)
+ | (new_data.values == iNaT)
+ )
if not in_range.all():
return data, False
date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
for date_unit in date_units:
try:
- new_data = to_datetime(new_data, errors='raise',
- unit=date_unit)
+ new_data = to_datetime(new_data, errors="raise", unit=date_unit)
except ValueError:
continue
except Exception:
@@ -804,52 +973,62 @@ def _try_convert_dates(self):
class SeriesParser(Parser):
- _default_orient = 'index'
- _split_keys = ('name', 'index', 'data')
+ _default_orient = "index"
+ _split_keys = ("name", "index", "data")
def _parse_no_numpy(self):
json = self.json
orient = self.orient
if orient == "split":
- decoded = {str(k): v for k, v in loads(
- json, precise_float=self.precise_float).items()}
+ decoded = {
+ str(k): v
+ for k, v in loads(json, precise_float=self.precise_float).items()
+ }
self.check_keys_split(decoded)
self.obj = Series(dtype=None, **decoded)
else:
- self.obj = Series(
- loads(json, precise_float=self.precise_float), dtype=None)
+ self.obj = Series(loads(json, precise_float=self.precise_float), dtype=None)
def _parse_numpy(self):
json = self.json
orient = self.orient
if orient == "split":
- decoded = loads(json, dtype=None, numpy=True,
- precise_float=self.precise_float)
+ decoded = loads(
+ json, dtype=None, numpy=True, precise_float=self.precise_float
+ )
decoded = {str(k): v for k, v in decoded.items()}
self.check_keys_split(decoded)
self.obj = Series(**decoded)
elif orient == "columns" or orient == "index":
- self.obj = Series(*loads(json, dtype=None, numpy=True,
- labelled=True,
- precise_float=self.precise_float))
+ self.obj = Series(
+ *loads(
+ json,
+ dtype=None,
+ numpy=True,
+ labelled=True,
+ precise_float=self.precise_float,
+ )
+ )
else:
- self.obj = Series(loads(json, dtype=None, numpy=True,
- precise_float=self.precise_float))
+ self.obj = Series(
+ loads(json, dtype=None, numpy=True, precise_float=self.precise_float)
+ )
def _try_convert_types(self):
if self.obj is None:
return
obj, result = self._try_convert_data(
- 'data', self.obj, convert_dates=self.convert_dates)
+ "data", self.obj, convert_dates=self.convert_dates
+ )
if result:
self.obj = obj
class FrameParser(Parser):
- _default_orient = 'columns'
- _split_keys = ('columns', 'index', 'data')
+ _default_orient = "columns"
+ _split_keys = ("columns", "index", "data")
def _parse_numpy(self):
@@ -857,24 +1036,37 @@ def _parse_numpy(self):
orient = self.orient
if orient == "columns":
- args = loads(json, dtype=None, numpy=True, labelled=True,
- precise_float=self.precise_float)
+ args = loads(
+ json,
+ dtype=None,
+ numpy=True,
+ labelled=True,
+ precise_float=self.precise_float,
+ )
if len(args):
args = (args[0].T, args[2], args[1])
self.obj = DataFrame(*args)
elif orient == "split":
- decoded = loads(json, dtype=None, numpy=True,
- precise_float=self.precise_float)
+ decoded = loads(
+ json, dtype=None, numpy=True, precise_float=self.precise_float
+ )
decoded = {str(k): v for k, v in decoded.items()}
self.check_keys_split(decoded)
self.obj = DataFrame(**decoded)
elif orient == "values":
- self.obj = DataFrame(loads(json, dtype=None, numpy=True,
- precise_float=self.precise_float))
+ self.obj = DataFrame(
+ loads(json, dtype=None, numpy=True, precise_float=self.precise_float)
+ )
else:
- self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
- labelled=True,
- precise_float=self.precise_float))
+ self.obj = DataFrame(
+ *loads(
+ json,
+ dtype=None,
+ numpy=True,
+ labelled=True,
+ precise_float=self.precise_float,
+ )
+ )
def _parse_no_numpy(self):
@@ -883,21 +1075,25 @@ def _parse_no_numpy(self):
if orient == "columns":
self.obj = DataFrame(
- loads(json, precise_float=self.precise_float), dtype=None)
+ loads(json, precise_float=self.precise_float), dtype=None
+ )
elif orient == "split":
- decoded = {str(k): v for k, v in loads(
- json, precise_float=self.precise_float).items()}
+ decoded = {
+ str(k): v
+ for k, v in loads(json, precise_float=self.precise_float).items()
+ }
self.check_keys_split(decoded)
self.obj = DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame(
- loads(json, precise_float=self.precise_float), dtype=None).T
- elif orient == 'table':
- self.obj = parse_table_schema(json,
- precise_float=self.precise_float)
+ loads(json, precise_float=self.precise_float), dtype=None
+ ).T
+ elif orient == "table":
+ self.obj = parse_table_schema(json, precise_float=self.precise_float)
else:
self.obj = DataFrame(
- loads(json, precise_float=self.precise_float), dtype=None)
+ loads(json, precise_float=self.precise_float), dtype=None
+ )
def _process_converter(self, f, filt=None):
"""
@@ -931,7 +1127,8 @@ def _try_convert_types(self):
self._try_convert_dates()
self._process_converter(
- lambda col, c: self._try_convert_data(col, c, convert_dates=False))
+ lambda col, c: self._try_convert_data(col, c, convert_dates=False)
+ )
def _try_convert_dates(self):
if self.obj is None:
@@ -951,16 +1148,20 @@ def is_ok(col):
return False
col_lower = col.lower()
- if (col_lower.endswith('_at') or
- col_lower.endswith('_time') or
- col_lower == 'modified' or
- col_lower == 'date' or
- col_lower == 'datetime' or
- col_lower.startswith('timestamp')):
+ if (
+ col_lower.endswith("_at")
+ or col_lower.endswith("_time")
+ or col_lower == "modified"
+ or col_lower == "date"
+ or col_lower == "datetime"
+ or col_lower.startswith("timestamp")
+ ):
return True
return False
self._process_converter(
lambda col, c: self._try_convert_to_date(c),
- lambda col, c: ((self.keep_default_dates and is_ok(col)) or
- col in convert_dates))
+ lambda col, c: (
+ (self.keep_default_dates and is_ok(col)) or col in convert_dates
+ ),
+ )
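A small round-trip sketch of the to_json()/read_json() keywords formatted above; line-delimited output requires orient="records", which to_json() enforces, and the frame contents are illustrative:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    # one JSON object per row, newline-delimited
    payload = df.to_json(orient="records", lines=True)

    # read it back; passing chunksize as well would return an iterator
    roundtripped = pd.read_json(payload, orient="records", lines=True)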
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index 5c6018d399c82..c09dc177ccbd1 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -19,16 +19,20 @@ def _convert_to_line_delimits(s):
# Determine we have a JSON list to turn to lines otherwise just return the
# json object, only lists can
- if not s[0] == '[' and s[-1] == ']':
+ if not s[0] == "[" and s[-1] == "]":
return s
s = s[1:-1]
return convert_json_to_lines(s)
-def nested_to_record(ds, prefix: str = "",
- sep: str = ".", level: int = 0,
- max_level: Optional[int] = None):
+def nested_to_record(
+ ds,
+ prefix: str = "",
+ sep: str = ".",
+ level: int = 0,
+ max_level: Optional[int] = None,
+):
"""
A simplified json_normalize
@@ -90,16 +94,16 @@ def nested_to_record(ds, prefix: str = "",
# current dict level < maximum level provided and
# only dicts gets recurse-flattened
# only at level>1 do we rename the rest of the keys
- if (not isinstance(v, dict) or
- (max_level is not None and level >= max_level)):
+ if not isinstance(v, dict) or (
+ max_level is not None and level >= max_level
+ ):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
else:
v = new_d.pop(k)
- new_d.update(nested_to_record(v, newkey, sep, level + 1,
- max_level))
+ new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
new_ds.append(new_d)
if singleton:
@@ -107,14 +111,16 @@ def nested_to_record(ds, prefix: str = "",
return new_ds
-def json_normalize(data: List[Dict],
- record_path: Optional[Union[str, List]] = None,
- meta: Optional[Union[str, List]] = None,
- meta_prefix: Optional[str] = None,
- record_prefix: Optional[str] = None,
- errors: Optional[str] = 'raise',
- sep: str = '.',
- max_level: Optional[int] = None):
+def json_normalize(
+ data: List[Dict],
+ record_path: Optional[Union[str, List]] = None,
+ meta: Optional[Union[str, List]] = None,
+ meta_prefix: Optional[str] = None,
+ record_prefix: Optional[str] = None,
+ errors: Optional[str] = "raise",
+ sep: str = ".",
+ max_level: Optional[int] = None,
+):
"""
Normalize semi-structured JSON data into a flat table.
@@ -230,6 +236,7 @@ def json_normalize(data: List[Dict],
Returns normalized data with columns prefixed with the given string.
"""
+
def _pull_field(js, spec):
result = js
if isinstance(spec, list):
@@ -256,8 +263,7 @@ def _pull_field(js, spec):
#
# TODO: handle record value which are lists, at least error
# reasonably
- data = nested_to_record(data, sep=sep,
- max_level=max_level)
+ data = nested_to_record(data, sep=sep, max_level=max_level)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]
@@ -287,14 +293,16 @@ def _recursive_extract(data, path, seen_meta, level=0):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])
- _recursive_extract(obj[path[0]], path[1:],
- seen_meta, level=level + 1)
+ _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
else:
for obj in data:
recs = _pull_field(obj, path[0])
- recs = [nested_to_record(r, sep=sep,
- max_level=max_level)
- if isinstance(r, dict) else r for r in recs]
+ recs = [
+ nested_to_record(r, sep=sep, max_level=max_level)
+ if isinstance(r, dict)
+ else r
+ for r in recs
+ ]
# For repeating the metadata later
lengths.append(len(recs))
@@ -305,13 +313,14 @@ def _recursive_extract(data, path, seen_meta, level=0):
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
- if errors == 'ignore':
+ if errors == "ignore":
meta_val = np.nan
else:
- raise KeyError("Try running with "
- "errors='ignore' as key "
- "{err} is not always present"
- .format(err=e))
+ raise KeyError(
+ "Try running with "
+ "errors='ignore' as key "
+ "{err} is not always present".format(err=e)
+ )
meta_vals[key].append(meta_val)
records.extend(recs)
@@ -320,8 +329,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
result = DataFrame(records)
if record_prefix is not None:
- result = result.rename(
- columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
+ result = result.rename(columns=lambda x: "{p}{c}".format(p=record_prefix, c=x))
# Data types, a problem
for k, v in meta_vals.items():
@@ -329,7 +337,9 @@ def _recursive_extract(data, path, seen_meta, level=0):
k = meta_prefix + k
if k in result:
- raise ValueError('Conflicting metadata name {name}, '
- 'need distinguishing prefix '.format(name=k))
+ raise ValueError(
+ "Conflicting metadata name {name}, "
+ "need distinguishing prefix ".format(name=k)
+ )
result[k] = np.array(v, dtype=object).repeat(lengths)
return result
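A brief sketch of json_normalize() with the record_path and meta parameters shown above, using illustrative nested data:

    from pandas.io.json import json_normalize

    data = [
        {
            "state": "Florida",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
            ],
        }
    ]

    # one row per county; state-level metadata is repeated on every row
    result = json_normalize(
        data, record_path="counties", meta=["state", ["info", "governor"]]
    )
    # columns: name, population, state, info.governor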
diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
index a54f5cdf723a3..045127c63af5c 100644
--- a/pandas/io/json/table_schema.py
+++ b/pandas/io/json/table_schema.py
@@ -8,9 +8,16 @@
import pandas._libs.json as json
from pandas.core.dtypes.common import (
- is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype,
- is_string_dtype, is_timedelta64_dtype)
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_integer_dtype,
+ is_numeric_dtype,
+ is_period_dtype,
+ is_string_dtype,
+ is_timedelta64_dtype,
+)
from pandas import DataFrame
from pandas.api.types import CategoricalDtype
@@ -50,70 +57,71 @@ def as_json_table_type(x):
=============== =================
"""
if is_integer_dtype(x):
- return 'integer'
+ return "integer"
elif is_bool_dtype(x):
- return 'boolean'
+ return "boolean"
elif is_numeric_dtype(x):
- return 'number'
- elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
- is_period_dtype(x)):
- return 'datetime'
+ return "number"
+ elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
+ return "datetime"
elif is_timedelta64_dtype(x):
- return 'duration'
+ return "duration"
elif is_categorical_dtype(x):
- return 'any'
+ return "any"
elif is_string_dtype(x):
- return 'string'
+ return "string"
else:
- return 'any'
+ return "any"
def set_default_names(data):
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
if com._all_not_none(*data.index.names):
nms = data.index.names
- if len(nms) == 1 and data.index.name == 'index':
+ if len(nms) == 1 and data.index.name == "index":
warnings.warn("Index name of 'index' is not round-trippable")
- elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
- warnings.warn("Index names beginning with 'level_' are not "
- "round-trippable")
+ elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
+ warnings.warn(
+ "Index names beginning with 'level_' are not " "round-trippable"
+ )
return data
data = data.copy()
if data.index.nlevels > 1:
- names = [name if name is not None else 'level_{}'.format(i)
- for i, name in enumerate(data.index.names)]
+ names = [
+ name if name is not None else "level_{}".format(i)
+ for i, name in enumerate(data.index.names)
+ ]
data.index.names = names
else:
- data.index.name = data.index.name or 'index'
+ data.index.name = data.index.name or "index"
return data
def convert_pandas_type_to_json_field(arr, dtype=None):
dtype = dtype or arr.dtype
if arr.name is None:
- name = 'values'
+ name = "values"
else:
name = arr.name
- field = {'name': name,
- 'type': as_json_table_type(dtype)}
+ field = {"name": name, "type": as_json_table_type(dtype)}
if is_categorical_dtype(arr):
- if hasattr(arr, 'categories'):
+ if hasattr(arr, "categories"):
cats = arr.categories
ordered = arr.ordered
else:
cats = arr.cat.categories
ordered = arr.cat.ordered
- field['constraints'] = {"enum": list(cats)}
- field['ordered'] = ordered
+ field["constraints"] = {"enum": list(cats)}
+ field["ordered"] = ordered
elif is_period_dtype(arr):
- field['freq'] = arr.freqstr
+ field["freq"] = arr.freqstr
elif is_datetime64tz_dtype(arr):
- if hasattr(arr, 'dt'):
- field['tz'] = arr.dt.tz.zone
+ if hasattr(arr, "dt"):
+ field["tz"] = arr.dt.tz.zone
else:
- field['tz'] = arr.tz.zone
+ field["tz"] = arr.tz.zone
return field
@@ -154,28 +162,29 @@ def convert_json_field_to_pandas_type(field):
'tz': 'US/Central'})
'datetime64[ns, US/Central]'
"""
- typ = field['type']
- if typ == 'string':
- return 'object'
- elif typ == 'integer':
- return 'int64'
- elif typ == 'number':
- return 'float64'
- elif typ == 'boolean':
- return 'bool'
- elif typ == 'duration':
- return 'timedelta64'
- elif typ == 'datetime':
- if field.get('tz'):
- return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
+ typ = field["type"]
+ if typ == "string":
+ return "object"
+ elif typ == "integer":
+ return "int64"
+ elif typ == "number":
+ return "float64"
+ elif typ == "boolean":
+ return "bool"
+ elif typ == "duration":
+ return "timedelta64"
+ elif typ == "datetime":
+ if field.get("tz"):
+ return "datetime64[ns, {tz}]".format(tz=field["tz"])
else:
- return 'datetime64[ns]'
- elif typ == 'any':
- if 'constraints' in field and 'ordered' in field:
- return CategoricalDtype(categories=field['constraints']['enum'],
- ordered=field['ordered'])
+ return "datetime64[ns]"
+ elif typ == "any":
+ if "constraints" in field and "ordered" in field:
+ return CategoricalDtype(
+ categories=field["constraints"]["enum"], ordered=field["ordered"]
+ )
else:
- return 'object'
+ return "object"
raise ValueError("Unsupported or invalid field type: {}".format(typ))
@@ -245,17 +254,17 @@ def build_table_schema(data, index=True, primary_key=None, version=True):
else:
fields.append(convert_pandas_type_to_json_field(data))
- schema['fields'] = fields
+ schema["fields"] = fields
if index and data.index.is_unique and primary_key is None:
if data.index.nlevels == 1:
- schema['primaryKey'] = [data.index.name]
+ schema["primaryKey"] = [data.index.name]
else:
- schema['primaryKey'] = data.index.names
+ schema["primaryKey"] = data.index.names
elif primary_key is not None:
- schema['primaryKey'] = primary_key
+ schema["primaryKey"] = primary_key
if version:
- schema['pandas_version'] = '0.20.0'
+ schema["pandas_version"] = "0.20.0"
return schema
@@ -296,31 +305,34 @@ def parse_table_schema(json, precise_float):
pandas.read_json
"""
table = loads(json, precise_float=precise_float)
- col_order = [field['name'] for field in table['schema']['fields']]
- df = DataFrame(table['data'], columns=col_order)[col_order]
+ col_order = [field["name"] for field in table["schema"]["fields"]]
+ df = DataFrame(table["data"], columns=col_order)[col_order]
- dtypes = {field['name']: convert_json_field_to_pandas_type(field)
- for field in table['schema']['fields']}
+ dtypes = {
+ field["name"]: convert_json_field_to_pandas_type(field)
+ for field in table["schema"]["fields"]
+ }
# Cannot directly use as_type with timezone data on object; raise for now
- if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
- raise NotImplementedError('table="orient" can not yet read timezone '
- 'data')
+ if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()):
+ raise NotImplementedError('table="orient" can not yet read timezone ' "data")
# No ISO constructor for Timedelta as of yet, so need to raise
- if 'timedelta64' in dtypes.values():
- raise NotImplementedError('table="orient" can not yet read '
- 'ISO-formatted Timedelta data')
+ if "timedelta64" in dtypes.values():
+ raise NotImplementedError(
+ 'table="orient" can not yet read ' "ISO-formatted Timedelta data"
+ )
df = df.astype(dtypes)
- if 'primaryKey' in table['schema']:
- df = df.set_index(table['schema']['primaryKey'])
+ if "primaryKey" in table["schema"]:
+ df = df.set_index(table["schema"]["primaryKey"])
if len(df.index.names) == 1:
- if df.index.name == 'index':
+ if df.index.name == "index":
df.index.name = None
else:
- df.index.names = [None if x.startswith('level_') else x for x in
- df.index.names]
+ df.index.names = [
+ None if x.startswith("level_") else x for x in df.index.names
+ ]
return df
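A short sketch of build_table_schema(), which produces the schema embedded by orient="table" above; the frame and index name are illustrative:

    import pandas as pd
    from pandas.io.json import build_table_schema

    df = pd.DataFrame(
        {"A": [1, 2, 3], "B": list("abc")},
        index=pd.Index([10, 20, 30], name="idx"),
    )

    schema = build_table_schema(df)
    # schema["fields"] has one entry per column plus the index, e.g.
    # {"name": "A", "type": "integer"}; a unique index becomes the primaryKey.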
diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py
index f8feffcf49240..9b09cffd83f75 100644
--- a/pandas/io/msgpack/__init__.py
+++ b/pandas/io/msgpack/__init__.py
@@ -6,8 +6,9 @@
from pandas.io.msgpack._version import version # noqa
-class ExtType(namedtuple('ExtType', 'code data')):
+class ExtType(namedtuple("ExtType", "code data")):
"""ExtType represents ext type in msgpack."""
+
def __new__(cls, code, data):
if not isinstance(code, int):
raise TypeError("code must be int")
@@ -17,6 +18,7 @@ def __new__(cls, code, data):
raise ValueError("code must be 0~127")
return super().__new__(cls, code, data)
+
import os # noqa
from pandas.io.msgpack._packer import Packer # noqa
diff --git a/pandas/io/msgpack/exceptions.py b/pandas/io/msgpack/exceptions.py
index ae0f74a6700bd..40f5a8af8f583 100644
--- a/pandas/io/msgpack/exceptions.py
+++ b/pandas/io/msgpack/exceptions.py
@@ -15,7 +15,6 @@ class UnpackValueError(UnpackException, ValueError):
class ExtraData(ValueError):
-
def __init__(self, unpacked, extra):
self.unpacked = unpacked
self.extra = extra
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
index e43f94e28d4af..b0ce7a4ccb12a 100644
--- a/pandas/io/packers.py
+++ b/pandas/io/packers.py
@@ -49,16 +49,37 @@
from pandas.compat._optional import import_optional_dependency
from pandas.errors import PerformanceWarning
from pandas.util._move import (
- BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer)
+ BadMove as _BadMove,
+ move_into_mutable_buffer as _move_into_mutable_buffer,
+)
from pandas.core.dtypes.common import (
- is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
- needs_i8_conversion, pandas_dtype)
+ is_categorical_dtype,
+ is_datetime64tz_dtype,
+ is_object_dtype,
+ needs_i8_conversion,
+ pandas_dtype,
+)
from pandas import ( # noqa:F401
- Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
- Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Period,
- PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp)
+ Categorical,
+ CategoricalIndex,
+ DataFrame,
+ DatetimeIndex,
+ Float64Index,
+ Index,
+ Int64Index,
+ Interval,
+ IntervalIndex,
+ MultiIndex,
+ NaT,
+ Period,
+ PeriodIndex,
+ RangeIndex,
+ Series,
+ TimedeltaIndex,
+ Timestamp,
+)
from pandas.core import internals
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
from pandas.core.arrays.sparse import BlockIndex, IntIndex
@@ -95,19 +116,22 @@ def to_msgpack(path_or_buf, *args, **kwargs):
compress : type of compressor (zlib or blosc), default to None (no
compression)
"""
- warnings.warn("to_msgpack is deprecated and will be removed in a "
- "future version.\n"
- "It is recommended to use pyarrow for on-the-wire "
- "transmission of pandas objects.",
- FutureWarning, stacklevel=3)
+ warnings.warn(
+ "to_msgpack is deprecated and will be removed in a "
+ "future version.\n"
+ "It is recommended to use pyarrow for on-the-wire "
+ "transmission of pandas objects.",
+ FutureWarning,
+ stacklevel=3,
+ )
global compressor
- compressor = kwargs.pop('compress', None)
- append = kwargs.pop('append', None)
+ compressor = kwargs.pop("compress", None)
+ append = kwargs.pop("append", None)
if append:
- mode = 'a+b'
+ mode = "a+b"
else:
- mode = 'wb'
+ mode = "wb"
def writer(fh):
for a in args:
@@ -125,7 +149,7 @@ def writer(fh):
writer(path_or_buf)
-def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
+def read_msgpack(path_or_buf, encoding="utf-8", iterator=False, **kwargs):
"""
Load msgpack pandas object from the specified
file path
@@ -152,11 +176,14 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
read_msgpack is only guaranteed to be backwards compatible to pandas
0.20.3.
"""
- warnings.warn("The read_msgpack is deprecated and will be removed in a "
- "future version.\n"
- "It is recommended to use pyarrow for on-the-wire "
- "transmission of pandas objects.",
- FutureWarning, stacklevel=3)
+ warnings.warn(
+ "The read_msgpack is deprecated and will be removed in a "
+ "future version.\n"
+ "It is recommended to use pyarrow for on-the-wire "
+ "transmission of pandas objects.",
+ FutureWarning,
+ stacklevel=3,
+ )
path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
if iterator:
@@ -182,7 +209,7 @@ def read(fh):
exists = False
if exists:
- with open(path_or_buf, 'rb') as fh:
+ with open(path_or_buf, "rb") as fh:
return read(fh)
if isinstance(path_or_buf, bytes):
@@ -194,25 +221,25 @@ def read(fh):
finally:
if fh is not None:
fh.close()
- elif hasattr(path_or_buf, 'read') and callable(path_or_buf.read):
+ elif hasattr(path_or_buf, "read") and callable(path_or_buf.read):
# treat as a buffer like
return read(path_or_buf)
- raise ValueError('path_or_buf needs to be a string file path or file-like')
+ raise ValueError("path_or_buf needs to be a string file path or file-like")
-dtype_dict = {21: np.dtype('M8[ns]'),
- 'datetime64[ns]': np.dtype('M8[ns]'),
- 'datetime64[us]': np.dtype('M8[us]'),
- 22: np.dtype('m8[ns]'),
- 'timedelta64[ns]': np.dtype('m8[ns]'),
- 'timedelta64[us]': np.dtype('m8[us]'),
-
- # this is platform int, which we need to remap to np.int64
- # for compat on windows platforms
- 7: np.dtype('int64'),
- 'category': 'category'
- }
+dtype_dict = {
+ 21: np.dtype("M8[ns]"),
+ "datetime64[ns]": np.dtype("M8[ns]"),
+ "datetime64[us]": np.dtype("M8[us]"),
+ 22: np.dtype("m8[ns]"),
+ "timedelta64[ns]": np.dtype("m8[ns]"),
+ "timedelta64[us]": np.dtype("m8[us]"),
+ # this is platform int, which we need to remap to np.int64
+ # for compat on windows platforms
+ 7: np.dtype("int64"),
+ "category": "category",
+}
def dtype_for(t):
@@ -222,13 +249,11 @@ def dtype_for(t):
return np.typeDict.get(t, t)
-c2f_dict = {'complex': np.float64,
- 'complex128': np.float64,
- 'complex64': np.float32}
+c2f_dict = {"complex": np.float64, "complex128": np.float64, "complex64": np.float32}
# windows (32 bit) compat
-if hasattr(np, 'float128'):
- c2f_dict['complex256'] = np.float128
+if hasattr(np, "float128"):
+ c2f_dict["complex256"] = np.float128
def c2f(r, i, ctype_name):
@@ -252,13 +277,12 @@ def convert(values):
return values.ravel().tolist()
if needs_i8_conversion(dtype):
- values = values.view('i8')
+ values = values.view("i8")
v = values.ravel()
- if compressor == 'zlib':
+ if compressor == "zlib":
zlib = import_optional_dependency(
- "zlib",
- extra="zlib is required when `compress='zlib'`."
+ "zlib", extra="zlib is required when `compress='zlib'`."
)
# return string arrays like they are
@@ -269,10 +293,9 @@ def convert(values):
v = v.tostring()
return ExtType(0, zlib.compress(v))
- elif compressor == 'blosc':
+ elif compressor == "blosc":
blosc = import_optional_dependency(
- "blosc",
- extra="zlib is required when `compress='blosc'`."
+            "blosc", extra="blosc is required when `compress='blosc'`."
)
# return string arrays like they are
@@ -303,19 +326,17 @@ def unconvert(values, dtype, compress=None):
dtype = pandas_dtype(dtype).base
if not as_is_ext:
- values = values.encode('latin1')
+ values = values.encode("latin1")
if compress:
- if compress == 'zlib':
+ if compress == "zlib":
zlib = import_optional_dependency(
- "zlib",
- extra="zlib is required when `compress='zlib'`."
+ "zlib", extra="zlib is required when `compress='zlib'`."
)
decompress = zlib.decompress
- elif compress == 'blosc':
+ elif compress == "blosc":
blosc = import_optional_dependency(
- "blosc",
- extra="zlib is required when `compress='blosc'`."
+                "blosc", extra="blosc is required when `compress='blosc'`."
)
decompress = blosc.decompress
else:
@@ -323,8 +344,7 @@ def unconvert(values, dtype, compress=None):
try:
return np.frombuffer(
- _move_into_mutable_buffer(decompress(values)),
- dtype=dtype,
+ _move_into_mutable_buffer(decompress(values)), dtype=dtype
)
except _BadMove as e:
# Pull the decompressed data off of the `_BadMove` exception.
@@ -338,8 +358,8 @@ def unconvert(values, dtype, compress=None):
# warn even though we need to make a copy because we are only
# copying at most 1 byte.
warnings.warn(
- 'copying data after decompressing; this may mean that'
- ' decompress is caching its result',
+ "copying data after decompressing; this may mean that"
+ " decompress is caching its result",
PerformanceWarning,
)
# fall through to copying `np.fromstring`
@@ -358,76 +378,87 @@ def encode(obj):
tobj = type(obj)
if isinstance(obj, Index):
if isinstance(obj, RangeIndex):
- return {'typ': 'range_index',
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'start': obj._range.start,
- 'stop': obj._range.stop,
- 'step': obj._range.step,
- }
+ return {
+ "typ": "range_index",
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "start": obj._range.start,
+ "stop": obj._range.stop,
+ "step": obj._range.step,
+ }
elif isinstance(obj, PeriodIndex):
- return {'typ': 'period_index',
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'freq': getattr(obj, 'freqstr', None),
- 'dtype': obj.dtype.name,
- 'data': convert(obj.asi8),
- 'compress': compressor}
+ return {
+ "typ": "period_index",
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "freq": getattr(obj, "freqstr", None),
+ "dtype": obj.dtype.name,
+ "data": convert(obj.asi8),
+ "compress": compressor,
+ }
elif isinstance(obj, DatetimeIndex):
- tz = getattr(obj, 'tz', None)
+ tz = getattr(obj, "tz", None)
# store tz info and data as UTC
if tz is not None:
tz = tz.zone
- obj = obj.tz_convert('UTC')
- return {'typ': 'datetime_index',
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'dtype': obj.dtype.name,
- 'data': convert(obj.asi8),
- 'freq': getattr(obj, 'freqstr', None),
- 'tz': tz,
- 'compress': compressor}
+ obj = obj.tz_convert("UTC")
+ return {
+ "typ": "datetime_index",
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "dtype": obj.dtype.name,
+ "data": convert(obj.asi8),
+ "freq": getattr(obj, "freqstr", None),
+ "tz": tz,
+ "compress": compressor,
+ }
elif isinstance(obj, (IntervalIndex, IntervalArray)):
if isinstance(obj, IntervalIndex):
- typ = 'interval_index'
+ typ = "interval_index"
else:
- typ = 'interval_array'
- return {'typ': typ,
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'left': getattr(obj, 'left', None),
- 'right': getattr(obj, 'right', None),
- 'closed': getattr(obj, 'closed', None)}
+ typ = "interval_array"
+ return {
+ "typ": typ,
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "left": getattr(obj, "left", None),
+ "right": getattr(obj, "right", None),
+ "closed": getattr(obj, "closed", None),
+ }
elif isinstance(obj, MultiIndex):
- return {'typ': 'multi_index',
- 'klass': obj.__class__.__name__,
- 'names': getattr(obj, 'names', None),
- 'dtype': obj.dtype.name,
- 'data': convert(obj.values),
- 'compress': compressor}
+ return {
+ "typ": "multi_index",
+ "klass": obj.__class__.__name__,
+ "names": getattr(obj, "names", None),
+ "dtype": obj.dtype.name,
+ "data": convert(obj.values),
+ "compress": compressor,
+ }
else:
- return {'typ': 'index',
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'dtype': obj.dtype.name,
- 'data': convert(obj.values),
- 'compress': compressor}
+ return {
+ "typ": "index",
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "dtype": obj.dtype.name,
+ "data": convert(obj.values),
+ "compress": compressor,
+ }
elif isinstance(obj, Categorical):
- return {'typ': 'category',
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'codes': obj.codes,
- 'categories': obj.categories,
- 'ordered': obj.ordered,
- 'compress': compressor}
+ return {
+ "typ": "category",
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "codes": obj.codes,
+ "categories": obj.categories,
+ "ordered": obj.ordered,
+ "compress": compressor,
+ }
elif isinstance(obj, Series):
if isinstance(obj, SparseSeries):
- raise NotImplementedError(
- 'msgpack sparse series is not implemented'
- )
+ raise NotImplementedError("msgpack sparse series is not implemented")
# d = {'typ': 'sparse_series',
# 'klass': obj.__class__.__name__,
# 'dtype': obj.dtype.name,
@@ -439,18 +470,18 @@ def encode(obj):
# d[f] = getattr(obj, f, None)
# return d
else:
- return {'typ': 'series',
- 'klass': obj.__class__.__name__,
- 'name': getattr(obj, 'name', None),
- 'index': obj.index,
- 'dtype': obj.dtype.name,
- 'data': convert(obj.values),
- 'compress': compressor}
+ return {
+ "typ": "series",
+ "klass": obj.__class__.__name__,
+ "name": getattr(obj, "name", None),
+ "index": obj.index,
+ "dtype": obj.dtype.name,
+ "data": convert(obj.values),
+ "compress": compressor,
+ }
elif issubclass(tobj, NDFrame):
if isinstance(obj, SparseDataFrame):
- raise NotImplementedError(
- 'msgpack sparse frame is not implemented'
- )
+ raise NotImplementedError("msgpack sparse frame is not implemented")
# d = {'typ': 'sparse_dataframe',
# 'klass': obj.__class__.__name__,
# 'columns': obj.columns}
@@ -466,19 +497,27 @@ def encode(obj):
data = data.consolidate()
# the block manager
- return {'typ': 'block_manager',
- 'klass': obj.__class__.__name__,
- 'axes': data.axes,
- 'blocks': [{'locs': b.mgr_locs.as_array,
- 'values': convert(b.values),
- 'shape': b.values.shape,
- 'dtype': b.dtype.name,
- 'klass': b.__class__.__name__,
- 'compress': compressor} for b in data.blocks]
+ return {
+ "typ": "block_manager",
+ "klass": obj.__class__.__name__,
+ "axes": data.axes,
+ "blocks": [
+ {
+ "locs": b.mgr_locs.as_array,
+ "values": convert(b.values),
+ "shape": b.values.shape,
+ "dtype": b.dtype.name,
+ "klass": b.__class__.__name__,
+ "compress": compressor,
}
-
- elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
- np.timedelta64)) or obj is NaT:
+ for b in data.blocks
+ ],
+ }
+
+ elif (
+ isinstance(obj, (datetime, date, np.datetime64, timedelta, np.timedelta64))
+ or obj is NaT
+ ):
if isinstance(obj, Timestamp):
tz = obj.tzinfo
if tz is not None:
@@ -486,71 +525,73 @@ def encode(obj):
freq = obj.freq
if freq is not None:
freq = freq.freqstr
- return {'typ': 'timestamp',
- 'value': obj.value,
- 'freq': freq,
- 'tz': tz}
+ return {"typ": "timestamp", "value": obj.value, "freq": freq, "tz": tz}
if obj is NaT:
- return {'typ': 'nat'}
+ return {"typ": "nat"}
elif isinstance(obj, np.timedelta64):
- return {'typ': 'timedelta64',
- 'data': obj.view('i8')}
+ return {"typ": "timedelta64", "data": obj.view("i8")}
elif isinstance(obj, timedelta):
- return {'typ': 'timedelta',
- 'data': (obj.days, obj.seconds, obj.microseconds)}
+ return {
+ "typ": "timedelta",
+ "data": (obj.days, obj.seconds, obj.microseconds),
+ }
elif isinstance(obj, np.datetime64):
- return {'typ': 'datetime64',
- 'data': str(obj)}
+ return {"typ": "datetime64", "data": str(obj)}
elif isinstance(obj, datetime):
- return {'typ': 'datetime',
- 'data': obj.isoformat()}
+ return {"typ": "datetime", "data": obj.isoformat()}
elif isinstance(obj, date):
- return {'typ': 'date',
- 'data': obj.isoformat()}
- raise Exception(
- "cannot encode this datetimelike object: {obj}".format(obj=obj))
+ return {"typ": "date", "data": obj.isoformat()}
+ raise Exception("cannot encode this datetimelike object: {obj}".format(obj=obj))
elif isinstance(obj, Period):
- return {'typ': 'period',
- 'ordinal': obj.ordinal,
- 'freq': obj.freqstr}
+ return {"typ": "period", "ordinal": obj.ordinal, "freq": obj.freqstr}
elif isinstance(obj, Interval):
- return {'typ': 'interval',
- 'left': obj.left,
- 'right': obj.right,
- 'closed': obj.closed}
+ return {
+ "typ": "interval",
+ "left": obj.left,
+ "right": obj.right,
+ "closed": obj.closed,
+ }
elif isinstance(obj, BlockIndex):
- return {'typ': 'block_index',
- 'klass': obj.__class__.__name__,
- 'blocs': obj.blocs,
- 'blengths': obj.blengths,
- 'length': obj.length}
+ return {
+ "typ": "block_index",
+ "klass": obj.__class__.__name__,
+ "blocs": obj.blocs,
+ "blengths": obj.blengths,
+ "length": obj.length,
+ }
elif isinstance(obj, IntIndex):
- return {'typ': 'int_index',
- 'klass': obj.__class__.__name__,
- 'indices': obj.indices,
- 'length': obj.length}
+ return {
+ "typ": "int_index",
+ "klass": obj.__class__.__name__,
+ "indices": obj.indices,
+ "length": obj.length,
+ }
elif isinstance(obj, np.ndarray):
- return {'typ': 'ndarray',
- 'shape': obj.shape,
- 'ndim': obj.ndim,
- 'dtype': obj.dtype.name,
- 'data': convert(obj),
- 'compress': compressor}
+ return {
+ "typ": "ndarray",
+ "shape": obj.shape,
+ "ndim": obj.ndim,
+ "dtype": obj.dtype.name,
+ "data": convert(obj),
+ "compress": compressor,
+ }
elif isinstance(obj, np.number):
if np.iscomplexobj(obj):
- return {'typ': 'np_scalar',
- 'sub_typ': 'np_complex',
- 'dtype': obj.dtype.name,
- 'real': np.real(obj).__repr__(),
- 'imag': np.imag(obj).__repr__()}
+ return {
+ "typ": "np_scalar",
+ "sub_typ": "np_complex",
+ "dtype": obj.dtype.name,
+ "real": np.real(obj).__repr__(),
+ "imag": np.imag(obj).__repr__(),
+ }
else:
- return {'typ': 'np_scalar',
- 'dtype': obj.dtype.name,
- 'data': obj.__repr__()}
+ return {"typ": "np_scalar", "dtype": obj.dtype.name, "data": obj.__repr__()}
elif isinstance(obj, complex):
- return {'typ': 'np_complex',
- 'real': np.real(obj).__repr__(),
- 'imag': np.imag(obj).__repr__()}
+ return {
+ "typ": "np_complex",
+ "real": np.real(obj).__repr__(),
+ "imag": np.imag(obj).__repr__(),
+ }
return obj
@@ -560,105 +601,101 @@ def decode(obj):
Decoder for deserializing numpy data types.
"""
- typ = obj.get('typ')
+ typ = obj.get("typ")
if typ is None:
return obj
- elif typ == 'timestamp':
- freq = obj['freq'] if 'freq' in obj else obj['offset']
- return Timestamp(obj['value'], tz=obj['tz'], freq=freq)
- elif typ == 'nat':
+ elif typ == "timestamp":
+ freq = obj["freq"] if "freq" in obj else obj["offset"]
+ return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
+ elif typ == "nat":
return NaT
- elif typ == 'period':
- return Period(ordinal=obj['ordinal'], freq=obj['freq'])
- elif typ == 'index':
- dtype = dtype_for(obj['dtype'])
- data = unconvert(obj['data'], dtype,
- obj.get('compress'))
- return Index(data, dtype=dtype, name=obj['name'])
- elif typ == 'range_index':
- return RangeIndex(obj['start'],
- obj['stop'],
- obj['step'],
- name=obj['name'])
- elif typ == 'multi_index':
- dtype = dtype_for(obj['dtype'])
- data = unconvert(obj['data'], dtype,
- obj.get('compress'))
+ elif typ == "period":
+ return Period(ordinal=obj["ordinal"], freq=obj["freq"])
+ elif typ == "index":
+ dtype = dtype_for(obj["dtype"])
+ data = unconvert(obj["data"], dtype, obj.get("compress"))
+ return Index(data, dtype=dtype, name=obj["name"])
+ elif typ == "range_index":
+ return RangeIndex(obj["start"], obj["stop"], obj["step"], name=obj["name"])
+ elif typ == "multi_index":
+ dtype = dtype_for(obj["dtype"])
+ data = unconvert(obj["data"], dtype, obj.get("compress"))
data = [tuple(x) for x in data]
- return MultiIndex.from_tuples(data, names=obj['names'])
- elif typ == 'period_index':
- data = unconvert(obj['data'], np.int64, obj.get('compress'))
- d = dict(name=obj['name'], freq=obj['freq'])
- freq = d.pop('freq', None)
+ return MultiIndex.from_tuples(data, names=obj["names"])
+ elif typ == "period_index":
+ data = unconvert(obj["data"], np.int64, obj.get("compress"))
+ d = dict(name=obj["name"], freq=obj["freq"])
+ freq = d.pop("freq", None)
return PeriodIndex(PeriodArray(data, freq), **d)
- elif typ == 'datetime_index':
- data = unconvert(obj['data'], np.int64, obj.get('compress'))
- d = dict(name=obj['name'], freq=obj['freq'])
+ elif typ == "datetime_index":
+ data = unconvert(obj["data"], np.int64, obj.get("compress"))
+ d = dict(name=obj["name"], freq=obj["freq"])
result = DatetimeIndex(data, **d)
- tz = obj['tz']
+ tz = obj["tz"]
# reverse tz conversion
if tz is not None:
- result = result.tz_localize('UTC').tz_convert(tz)
+ result = result.tz_localize("UTC").tz_convert(tz)
return result
- elif typ in ('interval_index', 'interval_array'):
- return globals()[obj['klass']].from_arrays(obj['left'],
- obj['right'],
- obj['closed'],
- name=obj['name'])
- elif typ == 'category':
- from_codes = globals()[obj['klass']].from_codes
- return from_codes(codes=obj['codes'],
- categories=obj['categories'],
- ordered=obj['ordered'])
-
- elif typ == 'interval':
- return Interval(obj['left'], obj['right'], obj['closed'])
- elif typ == 'series':
- dtype = dtype_for(obj['dtype'])
- index = obj['index']
- data = unconvert(obj['data'], dtype, obj['compress'])
- return Series(data, index=index, dtype=dtype, name=obj['name'])
-
- elif typ == 'block_manager':
- axes = obj['axes']
+ elif typ in ("interval_index", "interval_array"):
+ return globals()[obj["klass"]].from_arrays(
+ obj["left"], obj["right"], obj["closed"], name=obj["name"]
+ )
+ elif typ == "category":
+ from_codes = globals()[obj["klass"]].from_codes
+ return from_codes(
+ codes=obj["codes"], categories=obj["categories"], ordered=obj["ordered"]
+ )
+
+ elif typ == "interval":
+ return Interval(obj["left"], obj["right"], obj["closed"])
+ elif typ == "series":
+ dtype = dtype_for(obj["dtype"])
+ index = obj["index"]
+ data = unconvert(obj["data"], dtype, obj["compress"])
+ return Series(data, index=index, dtype=dtype, name=obj["name"])
+
+ elif typ == "block_manager":
+ axes = obj["axes"]
def create_block(b):
- values = _safe_reshape(unconvert(
- b['values'], dtype_for(b['dtype']),
- b['compress']), b['shape'])
+ values = _safe_reshape(
+ unconvert(b["values"], dtype_for(b["dtype"]), b["compress"]), b["shape"]
+ )
# locs handles duplicate column names, and should be used instead
# of items; see GH 9618
- if 'locs' in b:
- placement = b['locs']
+ if "locs" in b:
+ placement = b["locs"]
else:
- placement = axes[0].get_indexer(b['items'])
+ placement = axes[0].get_indexer(b["items"])
- if is_datetime64tz_dtype(b['dtype']):
+ if is_datetime64tz_dtype(b["dtype"]):
assert isinstance(values, np.ndarray), type(values)
- assert values.dtype == 'M8[ns]', values.dtype
- values = DatetimeArray(values, dtype=b['dtype'])
-
- return make_block(values=values,
- klass=getattr(internals, b['klass']),
- placement=placement,
- dtype=b['dtype'])
-
- blocks = [create_block(b) for b in obj['blocks']]
- return globals()[obj['klass']](BlockManager(blocks, axes))
- elif typ == 'datetime':
- return parse(obj['data'])
- elif typ == 'datetime64':
- return np.datetime64(parse(obj['data']))
- elif typ == 'date':
- return parse(obj['data']).date()
- elif typ == 'timedelta':
- return timedelta(*obj['data'])
- elif typ == 'timedelta64':
- return np.timedelta64(int(obj['data']))
+ assert values.dtype == "M8[ns]", values.dtype
+ values = DatetimeArray(values, dtype=b["dtype"])
+
+ return make_block(
+ values=values,
+ klass=getattr(internals, b["klass"]),
+ placement=placement,
+ dtype=b["dtype"],
+ )
+
+ blocks = [create_block(b) for b in obj["blocks"]]
+ return globals()[obj["klass"]](BlockManager(blocks, axes))
+ elif typ == "datetime":
+ return parse(obj["data"])
+ elif typ == "datetime64":
+ return np.datetime64(parse(obj["data"]))
+ elif typ == "date":
+ return parse(obj["data"]).date()
+ elif typ == "timedelta":
+ return timedelta(*obj["data"])
+ elif typ == "timedelta64":
+ return np.timedelta64(int(obj["data"]))
# elif typ == 'sparse_series':
# dtype = dtype_for(obj['dtype'])
# return SparseSeries(
@@ -671,94 +708,129 @@ def create_block(b):
# default_fill_value=obj['default_fill_value'],
# default_kind=obj['default_kind']
# )
- elif typ == 'block_index':
- return globals()[obj['klass']](obj['length'], obj['blocs'],
- obj['blengths'])
- elif typ == 'int_index':
- return globals()[obj['klass']](obj['length'], obj['indices'])
- elif typ == 'ndarray':
- return unconvert(obj['data'], np.typeDict[obj['dtype']],
- obj.get('compress')).reshape(obj['shape'])
- elif typ == 'np_scalar':
- if obj.get('sub_typ') == 'np_complex':
- return c2f(obj['real'], obj['imag'], obj['dtype'])
+ elif typ == "block_index":
+ return globals()[obj["klass"]](obj["length"], obj["blocs"], obj["blengths"])
+ elif typ == "int_index":
+ return globals()[obj["klass"]](obj["length"], obj["indices"])
+ elif typ == "ndarray":
+ return unconvert(
+ obj["data"], np.typeDict[obj["dtype"]], obj.get("compress")
+ ).reshape(obj["shape"])
+ elif typ == "np_scalar":
+ if obj.get("sub_typ") == "np_complex":
+ return c2f(obj["real"], obj["imag"], obj["dtype"])
else:
- dtype = dtype_for(obj['dtype'])
+ dtype = dtype_for(obj["dtype"])
try:
- return dtype(obj['data'])
+ return dtype(obj["data"])
except (ValueError, TypeError):
- return dtype.type(obj['data'])
- elif typ == 'np_complex':
- return complex(obj['real'] + '+' + obj['imag'] + 'j')
+ return dtype.type(obj["data"])
+ elif typ == "np_complex":
+ return complex(obj["real"] + "+" + obj["imag"] + "j")
elif isinstance(obj, (dict, list, set)):
return obj
else:
return obj
-def pack(o, default=encode,
- encoding='utf-8', unicode_errors='strict', use_single_float=False,
- autoreset=1, use_bin_type=1):
+def pack(
+ o,
+ default=encode,
+ encoding="utf-8",
+ unicode_errors="strict",
+ use_single_float=False,
+ autoreset=1,
+ use_bin_type=1,
+):
"""
Pack an object and return the packed bytes.
"""
- return Packer(default=default, encoding=encoding,
- unicode_errors=unicode_errors,
- use_single_float=use_single_float,
- autoreset=autoreset,
- use_bin_type=use_bin_type).pack(o)
-
-
-def unpack(packed, object_hook=decode,
- list_hook=None, use_list=False, encoding='utf-8',
- unicode_errors='strict', object_pairs_hook=None,
- max_buffer_size=0, ext_hook=ExtType):
+ return Packer(
+ default=default,
+ encoding=encoding,
+ unicode_errors=unicode_errors,
+ use_single_float=use_single_float,
+ autoreset=autoreset,
+ use_bin_type=use_bin_type,
+ ).pack(o)
+
+
+def unpack(
+ packed,
+ object_hook=decode,
+ list_hook=None,
+ use_list=False,
+ encoding="utf-8",
+ unicode_errors="strict",
+ object_pairs_hook=None,
+ max_buffer_size=0,
+ ext_hook=ExtType,
+):
"""
Unpack a packed object, return an iterator
Note: packed lists will be returned as tuples
"""
- return Unpacker(packed, object_hook=object_hook,
- list_hook=list_hook,
- use_list=use_list, encoding=encoding,
- unicode_errors=unicode_errors,
- object_pairs_hook=object_pairs_hook,
- max_buffer_size=max_buffer_size,
- ext_hook=ext_hook)
+ return Unpacker(
+ packed,
+ object_hook=object_hook,
+ list_hook=list_hook,
+ use_list=use_list,
+ encoding=encoding,
+ unicode_errors=unicode_errors,
+ object_pairs_hook=object_pairs_hook,
+ max_buffer_size=max_buffer_size,
+ ext_hook=ext_hook,
+ )
class Packer(_Packer):
-
- def __init__(self, default=encode,
- encoding='utf-8',
- unicode_errors='strict',
- use_single_float=False,
- autoreset=1,
- use_bin_type=1):
- super().__init__(default=default, encoding=encoding,
- unicode_errors=unicode_errors,
- use_single_float=use_single_float,
- autoreset=autoreset,
- use_bin_type=use_bin_type)
+ def __init__(
+ self,
+ default=encode,
+ encoding="utf-8",
+ unicode_errors="strict",
+ use_single_float=False,
+ autoreset=1,
+ use_bin_type=1,
+ ):
+ super().__init__(
+ default=default,
+ encoding=encoding,
+ unicode_errors=unicode_errors,
+ use_single_float=use_single_float,
+ autoreset=autoreset,
+ use_bin_type=use_bin_type,
+ )
class Unpacker(_Unpacker):
-
- def __init__(self, file_like=None, read_size=0, use_list=False,
- object_hook=decode,
- object_pairs_hook=None, list_hook=None, encoding='utf-8',
- unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
- super().__init__(file_like=file_like,
- read_size=read_size,
- use_list=use_list,
- object_hook=object_hook,
- object_pairs_hook=object_pairs_hook,
- list_hook=list_hook,
- encoding=encoding,
- unicode_errors=unicode_errors,
- max_buffer_size=max_buffer_size,
- ext_hook=ext_hook)
+ def __init__(
+ self,
+ file_like=None,
+ read_size=0,
+ use_list=False,
+ object_hook=decode,
+ object_pairs_hook=None,
+ list_hook=None,
+ encoding="utf-8",
+ unicode_errors="strict",
+ max_buffer_size=0,
+ ext_hook=ExtType,
+ ):
+ super().__init__(
+ file_like=file_like,
+ read_size=read_size,
+ use_list=use_list,
+ object_hook=object_hook,
+ object_pairs_hook=object_pairs_hook,
+ list_hook=list_hook,
+ encoding=encoding,
+ unicode_errors=unicode_errors,
+ max_buffer_size=max_buffer_size,
+ ext_hook=ext_hook,
+ )
class Iterator:
@@ -784,13 +856,13 @@ def __iter__(self):
path_exists = False
if path_exists:
- fh = open(self.path, 'rb')
+ fh = open(self.path, "rb")
else:
fh = BytesIO(self.path)
else:
- if not hasattr(self.path, 'read'):
+ if not hasattr(self.path, "read"):
fh = BytesIO(self.path)
else:
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 9a846d1c7845c..3db05b94e5dce 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -13,10 +13,10 @@
def get_engine(engine):
""" return our implementation """
- if engine == 'auto':
- engine = get_option('io.parquet.engine')
+ if engine == "auto":
+ engine = get_option("io.parquet.engine")
- if engine == 'auto':
+ if engine == "auto":
# try engines in this order
try:
return PyArrowImpl()
@@ -28,17 +28,19 @@ def get_engine(engine):
except ImportError:
pass
- raise ImportError("Unable to find a usable engine; "
- "tried using: 'pyarrow', 'fastparquet'.\n"
- "pyarrow or fastparquet is required for parquet "
- "support")
+ raise ImportError(
+ "Unable to find a usable engine; "
+ "tried using: 'pyarrow', 'fastparquet'.\n"
+ "pyarrow or fastparquet is required for parquet "
+ "support"
+ )
- if engine not in ['pyarrow', 'fastparquet']:
+ if engine not in ["pyarrow", "fastparquet"]:
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
- if engine == 'pyarrow':
+ if engine == "pyarrow":
return PyArrowImpl()
- elif engine == 'fastparquet':
+ elif engine == "fastparquet":
return FastParquetImpl()
@@ -53,14 +55,12 @@ def validate_dataframe(df):
raise ValueError("to_parquet only supports IO with DataFrames")
# must have value column names (strings only)
- if df.columns.inferred_type not in {'string', 'unicode'}:
+ if df.columns.inferred_type not in {"string", "unicode"}:
raise ValueError("parquet must have string column names")
# index level names must be strings
valid_names = all(
- isinstance(name, str)
- for name in df.index.names
- if name is not None
+ isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
@@ -73,42 +73,57 @@ def read(self, path, columns=None, **kwargs):
class PyArrowImpl(BaseImpl):
-
def __init__(self):
pyarrow = import_optional_dependency(
- "pyarrow",
- extra="pyarrow is required for parquet support."
+ "pyarrow", extra="pyarrow is required for parquet support."
)
import pyarrow.parquet
+
self.api = pyarrow
- def write(self, df, path, compression='snappy',
- coerce_timestamps='ms', index=None, partition_cols=None,
- **kwargs):
+ def write(
+ self,
+ df,
+ path,
+ compression="snappy",
+ coerce_timestamps="ms",
+ index=None,
+ partition_cols=None,
+ **kwargs
+ ):
self.validate_dataframe(df)
- path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
+ path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
if index is None:
from_pandas_kwargs = {}
else:
- from_pandas_kwargs = {'preserve_index': index}
+ from_pandas_kwargs = {"preserve_index": index}
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
if partition_cols is not None:
self.api.parquet.write_to_dataset(
- table, path, compression=compression,
+ table,
+ path,
+ compression=compression,
coerce_timestamps=coerce_timestamps,
- partition_cols=partition_cols, **kwargs)
+ partition_cols=partition_cols,
+ **kwargs
+ )
else:
self.api.parquet.write_table(
- table, path, compression=compression,
- coerce_timestamps=coerce_timestamps, **kwargs)
+ table,
+ path,
+ compression=compression,
+ coerce_timestamps=coerce_timestamps,
+ **kwargs
+ )
def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)
- kwargs['use_pandas_metadata'] = True
- result = self.api.parquet.read_table(path, columns=columns,
- **kwargs).to_pandas()
+ kwargs["use_pandas_metadata"] = True
+ result = self.api.parquet.read_table(
+ path, columns=columns, **kwargs
+ ).to_pandas()
if should_close:
try:
path.close()
@@ -119,47 +134,53 @@ def read(self, path, columns=None, **kwargs):
class FastParquetImpl(BaseImpl):
-
def __init__(self):
# since pandas is a dependency of fastparquet
# we need to import on first use
fastparquet = import_optional_dependency(
- "fastparquet",
- extra="fastparquet is required for parquet support."
+ "fastparquet", extra="fastparquet is required for parquet support."
)
self.api = fastparquet
- def write(self, df, path, compression='snappy', index=None,
- partition_cols=None, **kwargs):
+ def write(
+ self, df, path, compression="snappy", index=None, partition_cols=None, **kwargs
+ ):
self.validate_dataframe(df)
# thriftpy/protocol/compact.py:339:
# DeprecationWarning: tostring() is deprecated.
# Use tobytes() instead.
- if 'partition_on' in kwargs and partition_cols is not None:
- raise ValueError("Cannot use both partition_on and "
- "partition_cols. Use partition_cols for "
- "partitioning data")
- elif 'partition_on' in kwargs:
- partition_cols = kwargs.pop('partition_on')
+ if "partition_on" in kwargs and partition_cols is not None:
+ raise ValueError(
+ "Cannot use both partition_on and "
+ "partition_cols. Use partition_cols for "
+ "partitioning data"
+ )
+ elif "partition_on" in kwargs:
+ partition_cols = kwargs.pop("partition_on")
if partition_cols is not None:
- kwargs['file_scheme'] = 'hive'
+ kwargs["file_scheme"] = "hive"
if is_s3_url(path):
# path is s3:// so we need to open the s3file in 'wb' mode.
# TODO: Support 'ab'
- path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
+ path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
# And pass the opened s3file to the fastparquet internal impl.
- kwargs['open_with'] = lambda path, _: path
+ kwargs["open_with"] = lambda path, _: path
else:
path, _, _, _ = get_filepath_or_buffer(path)
with catch_warnings(record=True):
- self.api.write(path, df, compression=compression,
- write_index=index, partition_on=partition_cols,
- **kwargs)
+ self.api.write(
+ path,
+ df,
+ compression=compression,
+ write_index=index,
+ partition_on=partition_cols,
+ **kwargs
+ )
def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
@@ -178,8 +199,15 @@ def read(self, path, columns=None, **kwargs):
return parquet_file.to_pandas(columns=columns, **kwargs)
-def to_parquet(df, path, engine='auto', compression='snappy', index=None,
- partition_cols=None, **kwargs):
+def to_parquet(
+ df,
+ path,
+ engine="auto",
+ compression="snappy",
+ index=None,
+ partition_cols=None,
+ **kwargs
+):
"""
Write a DataFrame to the parquet format.
@@ -215,11 +243,17 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
Additional keyword arguments passed to the engine
"""
impl = get_engine(engine)
- return impl.write(df, path, compression=compression, index=index,
- partition_cols=partition_cols, **kwargs)
+ return impl.write(
+ df,
+ path,
+ compression=compression,
+ index=index,
+ partition_cols=partition_cols,
+ **kwargs
+ )
-def read_parquet(path, engine='auto', columns=None, **kwargs):
+def read_parquet(path, engine="auto", columns=None, **kwargs):
"""
Load a parquet object from the file path, returning a DataFrame.
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8fe0e466e7c0a..78440939ebc01 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -19,15 +19,30 @@
import pandas._libs.parsers as parsers
from pandas._libs.tslibs import parsing
from pandas.errors import (
- AbstractMethodError, EmptyDataError, ParserError, ParserWarning)
+ AbstractMethodError,
+ EmptyDataError,
+ ParserError,
+ ParserWarning,
+)
from pandas.util._decorators import Appender
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
- ensure_object, ensure_str, is_bool_dtype, is_categorical_dtype,
- is_dtype_equal, is_extension_array_dtype, is_float, is_integer,
- is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
- is_string_dtype, pandas_dtype)
+ ensure_object,
+ ensure_str,
+ is_bool_dtype,
+ is_categorical_dtype,
+ is_dtype_equal,
+ is_extension_array_dtype,
+ is_float,
+ is_integer,
+ is_integer_dtype,
+ is_list_like,
+ is_object_dtype,
+ is_scalar,
+ is_string_dtype,
+ pandas_dtype,
+)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import isna
@@ -35,24 +50,31 @@
from pandas.core import algorithms
from pandas.core.arrays import Categorical
from pandas.core.frame import DataFrame
-from pandas.core.index import (
- Index, MultiIndex, RangeIndex, ensure_index_from_sequences)
+from pandas.core.index import Index, MultiIndex, RangeIndex, ensure_index_from_sequences
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools
from pandas.io.common import (
- _NA_VALUES, BaseIterator, UnicodeReader, UTF8Recoder, _get_handle,
- _infer_compression, _validate_header_arg, get_filepath_or_buffer,
- is_file_like)
+ _NA_VALUES,
+ BaseIterator,
+ UnicodeReader,
+ UTF8Recoder,
+ _get_handle,
+ _infer_compression,
+ _validate_header_arg,
+ get_filepath_or_buffer,
+ is_file_like,
+)
from pandas.io.date_converters import generic_parser
# BOM character (byte order mark)
# This exists at the beginning of a file to indicate endianness
# of a file (stream). Unfortunately, this marker screws up parsing,
# so we need to remove it if we see it.
-_BOM = '\ufeff'
+_BOM = "\ufeff"
-_doc_read_csv_and_table = r"""
+_doc_read_csv_and_table = (
+ r"""
{summary}
Also supports optionally iterating or breaking of the file
@@ -168,8 +190,9 @@
na_values : scalar, str, list-like, or dict, optional
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted as
- NaN: '""" + fill("', '".join(sorted(_NA_VALUES)),
- 70, subsequent_indent=" ") + """'.
+ NaN: '"""
+ + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ")
+ + """'.
keep_default_na : bool, default True
Whether or not to include the default NaN values when parsing the data.
Depending on whether `na_values` is passed in, the behavior is as follows:
@@ -343,6 +366,7 @@
--------
>>> pd.{func_name}('data.csv') # doctest: +SKIP
"""
+)
def _validate_integer(name, val, min_val=0):
@@ -361,8 +385,9 @@ def _validate_integer(name, val, min_val=0):
min_val : int
Minimum allowed value (val < min_val will result in a ValueError)
"""
- msg = "'{name:s}' must be an integer >={min_val:d}".format(name=name,
- min_val=min_val)
+ msg = "'{name:s}' must be an integer >={min_val:d}".format(
+ name=name, min_val=min_val
+ )
if val is not None:
if is_float(val):
@@ -394,18 +419,18 @@ def _validate_names(names):
if names is not None:
if len(names) != len(set(names)):
- raise ValueError('Duplicate names are not allowed.')
+ raise ValueError("Duplicate names are not allowed.")
return names
def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
"""Generic reader of line files."""
- encoding = kwds.get('encoding', None)
+ encoding = kwds.get("encoding", None)
if encoding is not None:
- encoding = re.sub('_', '-', encoding).lower()
- kwds['encoding'] = encoding
+ encoding = re.sub("_", "-", encoding).lower()
+ kwds["encoding"] = encoding
- compression = kwds.get('compression', 'infer')
+ compression = kwds.get("compression", "infer")
compression = _infer_compression(filepath_or_buffer, compression)
# TODO: get_filepath_or_buffer could return
@@ -413,17 +438,18 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
# though mypy handling of conditional imports is difficult.
# See https://github.com/python/mypy/issues/1297
fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
- filepath_or_buffer, encoding, compression)
- kwds['compression'] = compression
+ filepath_or_buffer, encoding, compression
+ )
+ kwds["compression"] = compression
- if kwds.get('date_parser', None) is not None:
- if isinstance(kwds['parse_dates'], bool):
- kwds['parse_dates'] = True
+ if kwds.get("date_parser", None) is not None:
+ if isinstance(kwds["parse_dates"], bool):
+ kwds["parse_dates"] = True
# Extract some of the arguments (pass chunksize on).
- iterator = kwds.get('iterator', False)
- chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
- nrows = kwds.get('nrows', None)
+ iterator = kwds.get("iterator", False)
+ chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1)
+ nrows = kwds.get("nrows", None)
# Check for duplicates in names.
_validate_names(kwds.get("names", None))
@@ -449,147 +475,127 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
_parser_defaults = {
- 'delimiter': None,
-
- 'escapechar': None,
- 'quotechar': '"',
- 'quoting': csv.QUOTE_MINIMAL,
- 'doublequote': True,
- 'skipinitialspace': False,
- 'lineterminator': None,
-
- 'header': 'infer',
- 'index_col': None,
- 'names': None,
- 'prefix': None,
- 'skiprows': None,
- 'skipfooter': 0,
- 'nrows': None,
- 'na_values': None,
- 'keep_default_na': True,
-
- 'true_values': None,
- 'false_values': None,
- 'converters': None,
- 'dtype': None,
- 'cache_dates': True,
-
- 'thousands': None,
- 'comment': None,
- 'decimal': b'.',
-
+ "delimiter": None,
+ "escapechar": None,
+ "quotechar": '"',
+ "quoting": csv.QUOTE_MINIMAL,
+ "doublequote": True,
+ "skipinitialspace": False,
+ "lineterminator": None,
+ "header": "infer",
+ "index_col": None,
+ "names": None,
+ "prefix": None,
+ "skiprows": None,
+ "skipfooter": 0,
+ "nrows": None,
+ "na_values": None,
+ "keep_default_na": True,
+ "true_values": None,
+ "false_values": None,
+ "converters": None,
+ "dtype": None,
+ "cache_dates": True,
+ "thousands": None,
+ "comment": None,
+ "decimal": b".",
# 'engine': 'c',
- 'parse_dates': False,
- 'keep_date_col': False,
- 'dayfirst': False,
- 'date_parser': None,
- 'usecols': None,
-
+ "parse_dates": False,
+ "keep_date_col": False,
+ "dayfirst": False,
+ "date_parser": None,
+ "usecols": None,
# 'iterator': False,
- 'chunksize': None,
- 'verbose': False,
- 'encoding': None,
- 'squeeze': False,
- 'compression': None,
- 'mangle_dupe_cols': True,
- 'infer_datetime_format': False,
- 'skip_blank_lines': True
+ "chunksize": None,
+ "verbose": False,
+ "encoding": None,
+ "squeeze": False,
+ "compression": None,
+ "mangle_dupe_cols": True,
+ "infer_datetime_format": False,
+ "skip_blank_lines": True,
}
_c_parser_defaults = {
- 'delim_whitespace': False,
- 'na_filter': True,
- 'low_memory': True,
- 'memory_map': False,
- 'error_bad_lines': True,
- 'warn_bad_lines': True,
- 'float_precision': None
+ "delim_whitespace": False,
+ "na_filter": True,
+ "low_memory": True,
+ "memory_map": False,
+ "error_bad_lines": True,
+ "warn_bad_lines": True,
+ "float_precision": None,
}
-_fwf_defaults = {
- 'colspecs': 'infer',
- 'infer_nrows': 100,
- 'widths': None,
-}
+_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
-_c_unsupported = {'skipfooter'}
-_python_unsupported = {
- 'low_memory',
- 'float_precision',
-}
+_c_unsupported = {"skipfooter"}
+_python_unsupported = {"low_memory", "float_precision"}
_deprecated_defaults = {} # type: Dict[str, Any]
_deprecated_args = set() # type: Set[str]
-def _make_parser_function(name, default_sep=','):
-
- def parser_f(filepath_or_buffer: FilePathOrBuffer,
- sep=default_sep,
- delimiter=None,
-
- # Column and Index Locations and Names
- header='infer',
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- prefix=None,
- mangle_dupe_cols=True,
-
- # General Parsing Configuration
- dtype=None,
- engine=None,
- converters=None,
- true_values=None,
- false_values=None,
- skipinitialspace=False,
- skiprows=None,
- skipfooter=0,
- nrows=None,
-
- # NA and Missing Data Handling
- na_values=None,
- keep_default_na=True,
- na_filter=True,
- verbose=False,
- skip_blank_lines=True,
-
- # Datetime Handling
- parse_dates=False,
- infer_datetime_format=False,
- keep_date_col=False,
- date_parser=None,
- dayfirst=False,
- cache_dates=True,
-
- # Iteration
- iterator=False,
- chunksize=None,
-
- # Quoting, Compression, and File Format
- compression='infer',
- thousands=None,
- decimal=b'.',
- lineterminator=None,
- quotechar='"',
- quoting=csv.QUOTE_MINIMAL,
- doublequote=True,
- escapechar=None,
- comment=None,
- encoding=None,
- dialect=None,
-
- # Error Handling
- error_bad_lines=True,
- warn_bad_lines=True,
-
- # Internal
- delim_whitespace=False,
- low_memory=_c_parser_defaults['low_memory'],
- memory_map=False,
- float_precision=None):
+def _make_parser_function(name, default_sep=","):
+ def parser_f(
+ filepath_or_buffer: FilePathOrBuffer,
+ sep=default_sep,
+ delimiter=None,
+ # Column and Index Locations and Names
+ header="infer",
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ prefix=None,
+ mangle_dupe_cols=True,
+ # General Parsing Configuration
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skipinitialspace=False,
+ skiprows=None,
+ skipfooter=0,
+ nrows=None,
+ # NA and Missing Data Handling
+ na_values=None,
+ keep_default_na=True,
+ na_filter=True,
+ verbose=False,
+ skip_blank_lines=True,
+ # Datetime Handling
+ parse_dates=False,
+ infer_datetime_format=False,
+ keep_date_col=False,
+ date_parser=None,
+ dayfirst=False,
+ cache_dates=True,
+ # Iteration
+ iterator=False,
+ chunksize=None,
+ # Quoting, Compression, and File Format
+ compression="infer",
+ thousands=None,
+ decimal=b".",
+ lineterminator=None,
+ quotechar='"',
+ quoting=csv.QUOTE_MINIMAL,
+ doublequote=True,
+ escapechar=None,
+ comment=None,
+ encoding=None,
+ dialect=None,
+ # Error Handling
+ error_bad_lines=True,
+ warn_bad_lines=True,
+ # Internal
+ delim_whitespace=False,
+ low_memory=_c_parser_defaults["low_memory"],
+ memory_map=False,
+ float_precision=None,
+ ):
# gh-23761
#
@@ -614,69 +620,68 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
delimiter = sep
if delim_whitespace and delimiter != default_sep:
- raise ValueError("Specified a delimiter with both sep and"
- " delim_whitespace=True; you can only"
- " specify one.")
+ raise ValueError(
+ "Specified a delimiter with both sep and"
+ " delim_whitespace=True; you can only"
+ " specify one."
+ )
if engine is not None:
engine_specified = True
else:
- engine = 'c'
+ engine = "c"
engine_specified = False
- kwds.update(delimiter=delimiter,
- engine=engine,
- dialect=dialect,
- compression=compression,
- engine_specified=engine_specified,
-
- doublequote=doublequote,
- escapechar=escapechar,
- quotechar=quotechar,
- quoting=quoting,
- skipinitialspace=skipinitialspace,
- lineterminator=lineterminator,
-
- header=header,
- index_col=index_col,
- names=names,
- prefix=prefix,
- skiprows=skiprows,
- skipfooter=skipfooter,
- na_values=na_values,
- true_values=true_values,
- false_values=false_values,
- keep_default_na=keep_default_na,
- thousands=thousands,
- comment=comment,
- decimal=decimal,
-
- parse_dates=parse_dates,
- keep_date_col=keep_date_col,
- dayfirst=dayfirst,
- date_parser=date_parser,
- cache_dates=cache_dates,
-
- nrows=nrows,
- iterator=iterator,
- chunksize=chunksize,
- converters=converters,
- dtype=dtype,
- usecols=usecols,
- verbose=verbose,
- encoding=encoding,
- squeeze=squeeze,
- memory_map=memory_map,
- float_precision=float_precision,
-
- na_filter=na_filter,
- delim_whitespace=delim_whitespace,
- warn_bad_lines=warn_bad_lines,
- error_bad_lines=error_bad_lines,
- low_memory=low_memory,
- mangle_dupe_cols=mangle_dupe_cols,
- infer_datetime_format=infer_datetime_format,
- skip_blank_lines=skip_blank_lines)
+ kwds.update(
+ delimiter=delimiter,
+ engine=engine,
+ dialect=dialect,
+ compression=compression,
+ engine_specified=engine_specified,
+ doublequote=doublequote,
+ escapechar=escapechar,
+ quotechar=quotechar,
+ quoting=quoting,
+ skipinitialspace=skipinitialspace,
+ lineterminator=lineterminator,
+ header=header,
+ index_col=index_col,
+ names=names,
+ prefix=prefix,
+ skiprows=skiprows,
+ skipfooter=skipfooter,
+ na_values=na_values,
+ true_values=true_values,
+ false_values=false_values,
+ keep_default_na=keep_default_na,
+ thousands=thousands,
+ comment=comment,
+ decimal=decimal,
+ parse_dates=parse_dates,
+ keep_date_col=keep_date_col,
+ dayfirst=dayfirst,
+ date_parser=date_parser,
+ cache_dates=cache_dates,
+ nrows=nrows,
+ iterator=iterator,
+ chunksize=chunksize,
+ converters=converters,
+ dtype=dtype,
+ usecols=usecols,
+ verbose=verbose,
+ encoding=encoding,
+ squeeze=squeeze,
+ memory_map=memory_map,
+ float_precision=float_precision,
+ na_filter=na_filter,
+ delim_whitespace=delim_whitespace,
+ warn_bad_lines=warn_bad_lines,
+ error_bad_lines=error_bad_lines,
+ low_memory=low_memory,
+ mangle_dupe_cols=mangle_dupe_cols,
+ infer_datetime_format=infer_datetime_format,
+ skip_blank_lines=skip_blank_lines,
+ )
return _read(filepath_or_buffer, kwds)
@@ -685,27 +690,32 @@ def parser_f(filepath_or_buffer: FilePathOrBuffer,
return parser_f
-read_csv = _make_parser_function('read_csv', default_sep=',')
-read_csv = Appender(_doc_read_csv_and_table.format(
- func_name='read_csv',
- summary=('Read a comma-separated values (csv) file '
- 'into DataFrame.'),
- _default_sep="','")
- )(read_csv)
-
-read_table = _make_parser_function('read_table', default_sep='\t')
-read_table = Appender(_doc_read_csv_and_table.format(
- func_name='read_table',
- summary='Read general delimited file into DataFrame.',
- _default_sep=r"'\\t' (tab-stop)")
- )(read_table)
-
-
-def read_fwf(filepath_or_buffer: FilePathOrBuffer,
- colspecs='infer',
- widths=None,
- infer_nrows=100,
- **kwds):
+read_csv = _make_parser_function("read_csv", default_sep=",")
+read_csv = Appender(
+ _doc_read_csv_and_table.format(
+ func_name="read_csv",
+ summary=("Read a comma-separated values (csv) file " "into DataFrame."),
+ _default_sep="','",
+ )
+)(read_csv)
+
+read_table = _make_parser_function("read_table", default_sep="\t")
+read_table = Appender(
+ _doc_read_csv_and_table.format(
+ func_name="read_table",
+ summary="Read general delimited file into DataFrame.",
+ _default_sep=r"'\\t' (tab-stop)",
+ )
+)(read_table)
+
+
+def read_fwf(
+ filepath_or_buffer: FilePathOrBuffer,
+ colspecs="infer",
+ widths=None,
+ infer_nrows=100,
+ **kwds
+):
r"""
Read a table of fixed-width formatted lines into DataFrame.
@@ -765,9 +775,8 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer,
# Check input arguments.
if colspecs is None and widths is None:
raise ValueError("Must specify either colspecs or widths")
- elif colspecs not in (None, 'infer') and widths is not None:
- raise ValueError("You must specify only one of 'widths' and "
- "'colspecs'")
+ elif colspecs not in (None, "infer") and widths is not None:
+ raise ValueError("You must specify only one of 'widths' and " "'colspecs'")
# Compute 'colspecs' from 'widths', if specified.
if widths is not None:
@@ -776,9 +785,9 @@ def read_fwf(filepath_or_buffer: FilePathOrBuffer,
colspecs.append((col, col + w))
col += w
- kwds['colspecs'] = colspecs
- kwds['infer_nrows'] = infer_nrows
- kwds['engine'] = 'python-fwf'
+ kwds["colspecs"] = colspecs
+ kwds["infer_nrows"] = infer_nrows
+ kwds["engine"] = "python-fwf"
return _read(filepath_or_buffer, kwds)
@@ -796,25 +805,34 @@ def __init__(self, f, engine=None, **kwds):
if engine is not None:
engine_specified = True
else:
- engine = 'python'
+ engine = "python"
engine_specified = False
- self._engine_specified = kwds.get('engine_specified', engine_specified)
+ self._engine_specified = kwds.get("engine_specified", engine_specified)
- if kwds.get('dialect') is not None:
- dialect = kwds['dialect']
+ if kwds.get("dialect") is not None:
+ dialect = kwds["dialect"]
if dialect in csv.list_dialects():
dialect = csv.get_dialect(dialect)
# Any valid dialect should have these attributes.
# If any are missing, we will raise automatically.
- for param in ('delimiter', 'doublequote', 'escapechar',
- 'skipinitialspace', 'quotechar', 'quoting'):
+ for param in (
+ "delimiter",
+ "doublequote",
+ "escapechar",
+ "skipinitialspace",
+ "quotechar",
+ "quoting",
+ ):
try:
dialect_val = getattr(dialect, param)
except AttributeError:
- raise ValueError("Invalid dialect '{dialect}' provided"
- .format(dialect=kwds['dialect']))
+ raise ValueError(
+ "Invalid dialect '{dialect}' provided".format(
+ dialect=kwds["dialect"]
+ )
+ )
parser_default = _parser_defaults[param]
provided = kwds.get(param, parser_default)
@@ -825,21 +843,24 @@ def __init__(self, f, engine=None, **kwds):
# Don't warn if the default parameter was passed in,
# even if it conflicts with the dialect (gh-23761).
if provided != parser_default and provided != dialect_val:
- msg = ("Conflicting values for '{param}': '{val}' was "
- "provided, but the dialect specifies '{diaval}'. "
- "Using the dialect-specified value.".format(
- param=param, val=provided, diaval=dialect_val))
+ msg = (
+ "Conflicting values for '{param}': '{val}' was "
+ "provided, but the dialect specifies '{diaval}'. "
+ "Using the dialect-specified value.".format(
+ param=param, val=provided, diaval=dialect_val
+ )
+ )
# Annoying corner case for not warning about
# conflicts between dialect and delimiter parameter.
# Refer to the outer "_read_" function for more info.
- if not (param == "delimiter" and
- kwds.pop("sep_override", False)):
+ if not (param == "delimiter" and kwds.pop("sep_override", False)):
conflict_msgs.append(msg)
if conflict_msgs:
- warnings.warn('\n\n'.join(conflict_msgs), ParserWarning,
- stacklevel=2)
+ warnings.warn(
+ "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
+ )
kwds[param] = dialect_val
if kwds.get("skipfooter"):
@@ -848,8 +869,8 @@ def __init__(self, f, engine=None, **kwds):
if kwds.get("nrows"):
raise ValueError("'skipfooter' not supported with 'nrows'")
- if kwds.get('header', 'infer') == 'infer':
- kwds['header'] = 0 if kwds.get('names') is None else None
+ if kwds.get("header", "infer") == "infer":
+ kwds["header"] = 0 if kwds.get("names") is None else None
self.orig_options = kwds
@@ -860,16 +881,16 @@ def __init__(self, f, engine=None, **kwds):
options = self._get_options_with_defaults(engine)
- self.chunksize = options.pop('chunksize', None)
- self.nrows = options.pop('nrows', None)
- self.squeeze = options.pop('squeeze', False)
+ self.chunksize = options.pop("chunksize", None)
+ self.nrows = options.pop("nrows", None)
+ self.squeeze = options.pop("squeeze", False)
# might mutate self.engine
self.engine = self._check_file_or_buffer(f, engine)
self.options, self.engine = self._clean_options(options, engine)
- if 'has_index_names' in kwds:
- self.options['has_index_names'] = kwds['has_index_names']
+ if "has_index_names" in kwds:
+ self.options["has_index_names"] = kwds["has_index_names"]
self._make_engine(self.engine)
@@ -885,9 +906,10 @@ def _get_options_with_defaults(self, engine):
value = kwds.get(argname, default)
# see gh-12935
- if argname == 'mangle_dupe_cols' and not value:
- raise ValueError('Setting mangle_dupe_cols=False is '
- 'not supported yet')
+ if argname == "mangle_dupe_cols" and not value:
+ raise ValueError(
+ "Setting mangle_dupe_cols=False is " "not supported yet"
+ )
else:
options[argname] = value
@@ -895,21 +917,21 @@ def _get_options_with_defaults(self, engine):
if argname in kwds:
value = kwds[argname]
- if engine != 'c' and value != default:
- if ('python' in engine and
- argname not in _python_unsupported):
+ if engine != "c" and value != default:
+ if "python" in engine and argname not in _python_unsupported:
pass
elif value == _deprecated_defaults.get(argname, default):
pass
else:
raise ValueError(
- 'The %r option is not supported with the'
- ' %r engine' % (argname, engine))
+ "The %r option is not supported with the"
+ " %r engine" % (argname, engine)
+ )
else:
value = _deprecated_defaults.get(argname, default)
options[argname] = value
- if engine == 'python-fwf':
+ if engine == "python-fwf":
for argname, default in _fwf_defaults.items():
options[argname] = kwds.get(argname, default)
@@ -926,8 +948,7 @@ def _check_file_or_buffer(self, f, engine):
# needs to have that attribute ("next" for Python 2.x, "__next__"
# for Python 3.x)
if engine != "c" and not hasattr(f, next_attr):
- msg = ("The 'python' engine cannot iterate "
- "through this file buffer.")
+ msg = "The 'python' engine cannot iterate " "through this file buffer."
raise ValueError(msg)
return engine
@@ -938,36 +959,39 @@ def _clean_options(self, options, engine):
engine_specified = self._engine_specified
fallback_reason = None
- sep = options['delimiter']
- delim_whitespace = options['delim_whitespace']
+ sep = options["delimiter"]
+ delim_whitespace = options["delim_whitespace"]
# C engine not supported yet
- if engine == 'c':
- if options['skipfooter'] > 0:
- fallback_reason = ("the 'c' engine does not support"
- " skipfooter")
- engine = 'python'
+ if engine == "c":
+ if options["skipfooter"] > 0:
+ fallback_reason = "the 'c' engine does not support" " skipfooter"
+ engine = "python"
- encoding = sys.getfilesystemencoding() or 'utf-8'
+ encoding = sys.getfilesystemencoding() or "utf-8"
if sep is None and not delim_whitespace:
- if engine == 'c':
- fallback_reason = ("the 'c' engine does not support"
- " sep=None with delim_whitespace=False")
- engine = 'python'
+ if engine == "c":
+ fallback_reason = (
+ "the 'c' engine does not support"
+ " sep=None with delim_whitespace=False"
+ )
+ engine = "python"
elif sep is not None and len(sep) > 1:
- if engine == 'c' and sep == r'\s+':
- result['delim_whitespace'] = True
- del result['delimiter']
- elif engine not in ('python', 'python-fwf'):
+ if engine == "c" and sep == r"\s+":
+ result["delim_whitespace"] = True
+ del result["delimiter"]
+ elif engine not in ("python", "python-fwf"):
# wait until regex engine integrated
- fallback_reason = ("the 'c' engine does not support"
- " regex separators (separators > 1 char and"
- r" different from '\s+' are"
- " interpreted as regex)")
- engine = 'python'
+ fallback_reason = (
+ "the 'c' engine does not support"
+ " regex separators (separators > 1 char and"
+ r" different from '\s+' are"
+ " interpreted as regex)"
+ )
+ engine = "python"
elif delim_whitespace:
- if 'python' in engine:
- result['delimiter'] = r'\s+'
+ if "python" in engine:
+ result["delimiter"] = r"\s+"
elif sep is not None:
encodeable = True
try:
@@ -975,73 +999,85 @@ def _clean_options(self, options, engine):
encodeable = False
except UnicodeDecodeError:
encodeable = False
- if not encodeable and engine not in ('python', 'python-fwf'):
- fallback_reason = ("the separator encoded in {encoding}"
- " is > 1 char long, and the 'c' engine"
- " does not support such separators"
- .format(encoding=encoding))
- engine = 'python'
-
- quotechar = options['quotechar']
- if (quotechar is not None and
- isinstance(quotechar, (str, bytes))):
- if (len(quotechar) == 1 and ord(quotechar) > 127 and
- engine not in ('python', 'python-fwf')):
- fallback_reason = ("ord(quotechar) > 127, meaning the "
- "quotechar is larger than one byte, "
- "and the 'c' engine does not support "
- "such quotechars")
- engine = 'python'
+ if not encodeable and engine not in ("python", "python-fwf"):
+ fallback_reason = (
+ "the separator encoded in {encoding}"
+ " is > 1 char long, and the 'c' engine"
+ " does not support such separators".format(encoding=encoding)
+ )
+ engine = "python"
+
+ quotechar = options["quotechar"]
+ if quotechar is not None and isinstance(quotechar, (str, bytes)):
+ if (
+ len(quotechar) == 1
+ and ord(quotechar) > 127
+ and engine not in ("python", "python-fwf")
+ ):
+ fallback_reason = (
+ "ord(quotechar) > 127, meaning the "
+ "quotechar is larger than one byte, "
+ "and the 'c' engine does not support "
+ "such quotechars"
+ )
+ engine = "python"
if fallback_reason and engine_specified:
raise ValueError(fallback_reason)
- if engine == 'c':
+ if engine == "c":
for arg in _c_unsupported:
del result[arg]
- if 'python' in engine:
+ if "python" in engine:
for arg in _python_unsupported:
if fallback_reason and result[arg] != _c_parser_defaults[arg]:
- msg = ("Falling back to the 'python' engine because"
- " {reason}, but this causes {option!r} to be"
- " ignored as it is not supported by the 'python'"
- " engine.").format(reason=fallback_reason,
- option=arg)
+ msg = (
+ "Falling back to the 'python' engine because"
+ " {reason}, but this causes {option!r} to be"
+ " ignored as it is not supported by the 'python'"
+ " engine."
+ ).format(reason=fallback_reason, option=arg)
raise ValueError(msg)
del result[arg]
if fallback_reason:
- warnings.warn(("Falling back to the 'python' engine because"
- " {0}; you can avoid this warning by specifying"
- " engine='python'.").format(fallback_reason),
- ParserWarning, stacklevel=5)
+ warnings.warn(
+ (
+ "Falling back to the 'python' engine because"
+ " {0}; you can avoid this warning by specifying"
+ " engine='python'."
+ ).format(fallback_reason),
+ ParserWarning,
+ stacklevel=5,
+ )
- index_col = options['index_col']
- names = options['names']
- converters = options['converters']
- na_values = options['na_values']
- skiprows = options['skiprows']
+ index_col = options["index_col"]
+ names = options["names"]
+ converters = options["converters"]
+ na_values = options["na_values"]
+ skiprows = options["skiprows"]
- _validate_header_arg(options['header'])
+ _validate_header_arg(options["header"])
- depr_warning = ''
+ depr_warning = ""
for arg in _deprecated_args:
parser_default = _c_parser_defaults[arg]
depr_default = _deprecated_defaults[arg]
- msg = ("The '{arg}' argument has been deprecated "
- "and will be removed in a future version."
- .format(arg=arg))
+ msg = (
+ "The '{arg}' argument has been deprecated "
+ "and will be removed in a future version.".format(arg=arg)
+ )
if result.get(arg, depr_default) != depr_default:
# raise Exception(result.get(arg, depr_default), depr_default)
- depr_warning += msg + '\n\n'
+ depr_warning += msg + "\n\n"
else:
result[arg] = parser_default
- if depr_warning != '':
+ if depr_warning != "":
warnings.warn(depr_warning, FutureWarning, stacklevel=2)
if index_col is True:
@@ -1049,26 +1085,28 @@ def _clean_options(self, options, engine):
if _is_index_col(index_col):
if not isinstance(index_col, (list, tuple, np.ndarray)):
index_col = [index_col]
- result['index_col'] = index_col
+ result["index_col"] = index_col
names = list(names) if names is not None else names
# type conversion-related
if converters is not None:
if not isinstance(converters, dict):
- raise TypeError('Type converters must be a dict or'
- ' subclass, input was '
- 'a {0!r}'.format(type(converters).__name__))
+ raise TypeError(
+ "Type converters must be a dict or"
+ " subclass, input was "
+ "a {0!r}".format(type(converters).__name__)
+ )
else:
converters = {}
# Converting values to NA
- keep_default_na = options['keep_default_na']
+ keep_default_na = options["keep_default_na"]
na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
# handle skiprows; this is internally handled by the
# c-engine, so only need for python parsers
- if engine != 'c':
+ if engine != "c":
if is_integer(skiprows):
skiprows = list(range(skiprows))
if skiprows is None:
@@ -1077,11 +1115,11 @@ def _clean_options(self, options, engine):
skiprows = set(skiprows)
# put stuff back
- result['names'] = names
- result['converters'] = converters
- result['na_values'] = na_values
- result['na_fvalues'] = na_fvalues
- result['skiprows'] = skiprows
+ result["names"] = names
+ result["converters"] = converters
+ result["na_values"] = na_values
+ result["na_fvalues"] = na_fvalues
+ result["skiprows"] = skiprows
return result, engine
@@ -1092,25 +1130,27 @@ def __next__(self):
self.close()
raise
- def _make_engine(self, engine='c'):
- if engine == 'c':
+ def _make_engine(self, engine="c"):
+ if engine == "c":
self._engine = CParserWrapper(self.f, **self.options)
else:
- if engine == 'python':
+ if engine == "python":
klass = PythonParser
- elif engine == 'python-fwf':
+ elif engine == "python-fwf":
klass = FixedWidthFieldParser
else:
- raise ValueError('Unknown engine: {engine} (valid options are'
- ' "c", "python", or' ' "python-fwf")'.format(
- engine=engine))
+ raise ValueError(
+ "Unknown engine: {engine} (valid options are"
+ ' "c", "python", or'
+ ' "python-fwf")'.format(engine=engine)
+ )
self._engine = klass(self.f, **self.options)
def _failover_to_python(self):
raise AbstractMethodError(self)
def read(self, nrows=None):
- nrows = _validate_integer('nrows', nrows)
+ nrows = _validate_integer("nrows", nrows)
ret = self._engine.read(nrows)
# May alter columns / col_dict
@@ -1166,8 +1206,11 @@ def _is_potential_multi_index(columns):
-------
boolean : Whether or not columns could become a MultiIndex
"""
- return (len(columns) and not isinstance(columns, MultiIndex) and
- all(isinstance(c, tuple) for c in columns))
+ return (
+ len(columns)
+ and not isinstance(columns, MultiIndex)
+ and all(isinstance(c, tuple) for c in columns)
+ )
def _evaluate_usecols(usecols, names):
@@ -1271,8 +1314,10 @@ def _validate_usecols_arg(usecols):
    'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
is passed in or None if a callable or None is passed in.
"""
- msg = ("'usecols' must either be list-like of all strings, all unicode, "
- "all integers or a callable.")
+ msg = (
+ "'usecols' must either be list-like of all strings, all unicode, "
+ "all integers or a callable."
+ )
if usecols is not None:
if callable(usecols):
return usecols, None
@@ -1285,8 +1330,7 @@ def _validate_usecols_arg(usecols):
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
- if usecols_dtype not in ("empty", "integer",
- "string", "unicode"):
+ if usecols_dtype not in ("empty", "integer", "string", "unicode"):
raise ValueError(msg)
usecols = set(usecols)
@@ -1301,9 +1345,11 @@ def _validate_parse_dates_arg(parse_dates):
is a non-boolean scalar. Raises a ValueError if
that is the case.
"""
- msg = ("Only booleans, lists, and "
- "dictionaries are accepted "
- "for the 'parse_dates' parameter")
+ msg = (
+ "Only booleans, lists, and "
+ "dictionaries are accepted "
+ "for the 'parse_dates' parameter"
+ )
if parse_dates is not None:
if is_scalar(parse_dates):
@@ -1317,62 +1363,65 @@ def _validate_parse_dates_arg(parse_dates):
class ParserBase:
-
def __init__(self, kwds):
- self.names = kwds.get('names')
+ self.names = kwds.get("names")
self.orig_names = None
- self.prefix = kwds.pop('prefix', None)
+ self.prefix = kwds.pop("prefix", None)
- self.index_col = kwds.get('index_col', None)
+ self.index_col = kwds.get("index_col", None)
self.unnamed_cols = set()
self.index_names = None
self.col_names = None
- self.parse_dates = _validate_parse_dates_arg(
- kwds.pop('parse_dates', False))
- self.date_parser = kwds.pop('date_parser', None)
- self.dayfirst = kwds.pop('dayfirst', False)
- self.keep_date_col = kwds.pop('keep_date_col', False)
+ self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
+ self.date_parser = kwds.pop("date_parser", None)
+ self.dayfirst = kwds.pop("dayfirst", False)
+ self.keep_date_col = kwds.pop("keep_date_col", False)
- self.na_values = kwds.get('na_values')
- self.na_fvalues = kwds.get('na_fvalues')
- self.na_filter = kwds.get('na_filter', False)
- self.keep_default_na = kwds.get('keep_default_na', True)
+ self.na_values = kwds.get("na_values")
+ self.na_fvalues = kwds.get("na_fvalues")
+ self.na_filter = kwds.get("na_filter", False)
+ self.keep_default_na = kwds.get("keep_default_na", True)
- self.true_values = kwds.get('true_values')
- self.false_values = kwds.get('false_values')
- self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True)
- self.infer_datetime_format = kwds.pop('infer_datetime_format', False)
- self.cache_dates = kwds.pop('cache_dates', True)
+ self.true_values = kwds.get("true_values")
+ self.false_values = kwds.get("false_values")
+ self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
+ self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
+ self.cache_dates = kwds.pop("cache_dates", True)
self._date_conv = _make_date_converter(
date_parser=self.date_parser,
dayfirst=self.dayfirst,
infer_datetime_format=self.infer_datetime_format,
- cache_dates=self.cache_dates
+ cache_dates=self.cache_dates,
)
# validate header options for mi
- self.header = kwds.get('header')
+ self.header = kwds.get("header")
if isinstance(self.header, (list, tuple, np.ndarray)):
if not all(map(is_integer, self.header)):
raise ValueError("header must be integer or list of integers")
- if kwds.get('usecols'):
- raise ValueError("cannot specify usecols when "
- "specifying a multi-index header")
- if kwds.get('names'):
- raise ValueError("cannot specify names when "
- "specifying a multi-index header")
+ if kwds.get("usecols"):
+ raise ValueError(
+ "cannot specify usecols when " "specifying a multi-index header"
+ )
+ if kwds.get("names"):
+ raise ValueError(
+ "cannot specify names when " "specifying a multi-index header"
+ )
# validate index_col that only contains integers
if self.index_col is not None:
- is_sequence = isinstance(self.index_col, (list, tuple,
- np.ndarray))
- if not (is_sequence and
- all(map(is_integer, self.index_col)) or
- is_integer(self.index_col)):
- raise ValueError("index_col must only contain row numbers "
- "when specifying a multi-index header")
+ is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
+ if not (
+ is_sequence
+ and all(map(is_integer, self.index_col))
+ or is_integer(self.index_col)
+ ):
+ raise ValueError(
+ "index_col must only contain row numbers "
+ "when specifying a multi-index header"
+ )
# GH 16338
elif self.header is not None and not is_integer(self.header):
@@ -1392,10 +1441,11 @@ def close(self):
@property
def _has_complex_date_col(self):
- return (isinstance(self.parse_dates, dict) or
- (isinstance(self.parse_dates, list) and
- len(self.parse_dates) > 0 and
- isinstance(self.parse_dates[0], list)))
+ return isinstance(self.parse_dates, dict) or (
+ isinstance(self.parse_dates, list)
+ and len(self.parse_dates) > 0
+ and isinstance(self.parse_dates[0], list)
+ )
def _should_parse_dates(self, i):
if isinstance(self.parse_dates, bool):
@@ -1408,14 +1458,17 @@ def _should_parse_dates(self, i):
j = self.index_col[i]
if is_scalar(self.parse_dates):
- return ((j == self.parse_dates) or
- (name is not None and name == self.parse_dates))
+ return (j == self.parse_dates) or (
+ name is not None and name == self.parse_dates
+ )
else:
- return ((j in self.parse_dates) or
- (name is not None and name in self.parse_dates))
+ return (j in self.parse_dates) or (
+ name is not None and name in self.parse_dates
+ )
- def _extract_multi_indexer_columns(self, header, index_names, col_names,
- passed_names=False):
+ def _extract_multi_indexer_columns(
+ self, header, index_names, col_names, passed_names=False
+ ):
""" extract and return the names, index_names, col_names
header is a list-of-lists returned from the parsers """
if len(header) < 2:
@@ -1434,9 +1487,9 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names,
# clean the index_names
index_names = header.pop(-1)
- index_names, names, index_col = _clean_index_names(index_names,
- self.index_col,
- self.unnamed_cols)
+ index_names, names, index_col = _clean_index_names(
+ index_names, self.index_col, self.unnamed_cols
+ )
# extract the columns
field_count = len(header[0])
@@ -1453,15 +1506,17 @@ def extract(r):
if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
raise ParserError(
"Passed header=[{header}] are too many rows for this "
- "multi_index of columns"
- .format(header=','.join(str(x) for x in self.header))
+ "multi_index of columns".format(
+ header=",".join(str(x) for x in self.header)
+ )
)
# Clean the column names (if we have an index_col).
if len(ic):
- col_names = [r[0] if (len(r[0]) and
- r[0] not in self.unnamed_cols) else None
- for r in header]
+ col_names = [
+ r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None
+ for r in header
+ ]
else:
col_names = [None] * len(header)
@@ -1487,11 +1542,11 @@ def _maybe_dedup_names(self, names):
counts[col] = cur_count + 1
if is_potential_mi:
- col = col[:-1] + ('{column}.{count}'.format(
- column=col[-1], count=cur_count),)
+ col = col[:-1] + (
+ "{column}.{count}".format(column=col[-1], count=cur_count),
+ )
else:
- col = '{column}.{count}'.format(
- column=col, count=cur_count)
+ col = "{column}.{count}".format(column=col, count=cur_count)
cur_count = counts[col]
names[i] = col
@@ -1514,10 +1569,9 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
index = self._agg_index(index)
elif self._has_complex_date_col:
if not self._name_processed:
- (self.index_names, _,
- self.index_col) = _clean_index_names(list(columns),
- self.index_col,
- self.unnamed_cols)
+ (self.index_names, _, self.index_col) = _clean_index_names(
+ list(columns), self.index_col, self.unnamed_cols
+ )
self._name_processed = True
index = self._get_complex_date_index(data, columns)
index = self._agg_index(index, try_parse_dates=False)
@@ -1538,7 +1592,7 @@ def _get_simple_index(self, data, columns):
def ix(col):
if not isinstance(col, str):
return col
- raise ValueError('Index {col} invalid'.format(col=col))
+ raise ValueError("Index {col} invalid".format(col=col))
to_remove = []
index = []
@@ -1562,8 +1616,11 @@ def _get_name(icol):
return icol
if col_names is None:
- raise ValueError(('Must supply column order to use {icol!s} '
- 'as index').format(icol=icol))
+ raise ValueError(
+ ("Must supply column order to use {icol!s} " "as index").format(
+ icol=icol
+ )
+ )
for i, c in enumerate(col_names):
if i == icol:
@@ -1603,8 +1660,8 @@ def _agg_index(self, index, try_parse_dates=True):
col_name = self.index_names[i]
if col_name is not None:
col_na_values, col_na_fvalues = _get_na_values(
- col_name, self.na_values, self.na_fvalues,
- self.keep_default_na)
+ col_name, self.na_values, self.na_fvalues, self.keep_default_na
+ )
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
arrays.append(arr)
@@ -1614,8 +1671,9 @@ def _agg_index(self, index, try_parse_dates=True):
return index
- def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
- converters=None, dtypes=None):
+ def _convert_to_ndarrays(
+ self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
+ ):
result = {}
for c, values in dct.items():
conv_f = None if converters is None else converters.get(c, None)
@@ -1627,50 +1685,61 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
if self.na_filter:
col_na_values, col_na_fvalues = _get_na_values(
- c, na_values, na_fvalues, self.keep_default_na)
+ c, na_values, na_fvalues, self.keep_default_na
+ )
else:
col_na_values, col_na_fvalues = set(), set()
if conv_f is not None:
# conv_f applied to data before inference
if cast_type is not None:
- warnings.warn(("Both a converter and dtype were specified "
- "for column {0} - only the converter will "
- "be used").format(c), ParserWarning,
- stacklevel=7)
+ warnings.warn(
+ (
+ "Both a converter and dtype were specified "
+ "for column {0} - only the converter will "
+ "be used"
+ ).format(c),
+ ParserWarning,
+ stacklevel=7,
+ )
try:
values = lib.map_infer(values, conv_f)
except ValueError:
- mask = algorithms.isin(
- values, list(na_values)).view(np.uint8)
+ mask = algorithms.isin(values, list(na_values)).view(np.uint8)
values = lib.map_infer_mask(values, conv_f, mask)
cvals, na_count = self._infer_types(
- values, set(col_na_values) | col_na_fvalues,
- try_num_bool=False)
+ values, set(col_na_values) | col_na_fvalues, try_num_bool=False
+ )
else:
- is_str_or_ea_dtype = (is_string_dtype(cast_type)
- or is_extension_array_dtype(cast_type))
+ is_str_or_ea_dtype = is_string_dtype(
+ cast_type
+ ) or is_extension_array_dtype(cast_type)
# skip inference if specified dtype is object
# or casting to an EA
try_num_bool = not (cast_type and is_str_or_ea_dtype)
# general type inference and conversion
cvals, na_count = self._infer_types(
- values, set(col_na_values) | col_na_fvalues,
- try_num_bool)
+ values, set(col_na_values) | col_na_fvalues, try_num_bool
+ )
# type specified in dtype param or cast_type is an EA
- if cast_type and (not is_dtype_equal(cvals, cast_type)
- or is_extension_array_dtype(cast_type)):
+ if cast_type and (
+ not is_dtype_equal(cvals, cast_type)
+ or is_extension_array_dtype(cast_type)
+ ):
try:
- if (is_bool_dtype(cast_type) and
- not is_categorical_dtype(cast_type)
- and na_count > 0):
- raise ValueError("Bool column has NA values in "
- "column {column}"
- .format(column=c))
+ if (
+ is_bool_dtype(cast_type)
+ and not is_categorical_dtype(cast_type)
+ and na_count > 0
+ ):
+ raise ValueError(
+ "Bool column has NA values in "
+ "column {column}".format(column=c)
+ )
except (AttributeError, TypeError):
# invalid input to is_bool_dtype
pass
@@ -1678,8 +1747,11 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
result[c] = cvals
if verbose and na_count:
- print('Filled {count} NA values in column {c!s}'.format(
- count=na_count, c=c))
+ print(
+ "Filled {count} NA values in column {c!s}".format(
+ count=na_count, c=c
+ )
+ )
return result
def _infer_types(self, values, na_values, try_num_bool=True):
@@ -1715,17 +1787,18 @@ def _infer_types(self, values, na_values, try_num_bool=True):
except Exception:
result = values
if values.dtype == np.object_:
- na_count = parsers.sanitize_objects(result,
- na_values, False)
+ na_count = parsers.sanitize_objects(result, na_values, False)
else:
result = values
if values.dtype == np.object_:
na_count = parsers.sanitize_objects(values, na_values, False)
if result.dtype == np.object_ and try_num_bool:
- result = libops.maybe_convert_bool(np.asarray(values),
- true_values=self.true_values,
- false_values=self.false_values)
+ result = libops.maybe_convert_bool(
+ np.asarray(values),
+ true_values=self.true_values,
+ false_values=self.false_values,
+ )
return result, na_count
@@ -1747,8 +1820,10 @@ def _cast_types(self, values, cast_type, column):
"""
if is_categorical_dtype(cast_type):
- known_cats = (isinstance(cast_type, CategoricalDtype) and
- cast_type.categories is not None)
+ known_cats = (
+ isinstance(cast_type, CategoricalDtype)
+ and cast_type.categories is not None
+ )
if not is_object_dtype(values) and not known_cats:
# XXX this is for consistency with
@@ -1758,8 +1833,8 @@ def _cast_types(self, values, cast_type, column):
cats = Index(values).unique().dropna()
values = Categorical._from_inferred_categories(
- cats, cats.get_indexer(values), cast_type,
- true_values=self.true_values)
+ cats, cats.get_indexer(values), cast_type, true_values=self.true_values
+ )
# use the EA's implementation of casting
elif is_extension_array_dtype(cast_type):
@@ -1767,23 +1842,22 @@ def _cast_types(self, values, cast_type, column):
cast_type = pandas_dtype(cast_type)
array_type = cast_type.construct_array_type()
try:
- return array_type._from_sequence_of_strings(values,
- dtype=cast_type)
+ return array_type._from_sequence_of_strings(values, dtype=cast_type)
except NotImplementedError:
raise NotImplementedError(
"Extension Array: {ea} must implement "
"_from_sequence_of_strings in order "
- "to be used in parser methods".format(ea=array_type))
+ "to be used in parser methods".format(ea=array_type)
+ )
else:
try:
- values = astype_nansafe(values, cast_type,
- copy=True, skipna=True)
+ values = astype_nansafe(values, cast_type, copy=True, skipna=True)
except ValueError:
raise ValueError(
"Unable to convert column {column} to type "
- "{cast_type}".format(
- column=column, cast_type=cast_type))
+ "{cast_type}".format(column=column, cast_type=cast_type)
+ )
return values
def _do_date_conversions(self, names, data):
@@ -1791,8 +1865,14 @@ def _do_date_conversions(self, names, data):
if self.parse_dates is not None:
data, names = _process_date_conversion(
- data, self._date_conv, self.parse_dates, self.index_col,
- self.index_names, names, keep_date_col=self.keep_date_col)
+ data,
+ self._date_conv,
+ self.parse_dates,
+ self.index_col,
+ self.index_names,
+ names,
+ keep_date_col=self.keep_date_col,
+ )
return names, data
@@ -1808,22 +1888,20 @@ def __init__(self, src, **kwds):
ParserBase.__init__(self, kwds)
- if (kwds.get('compression') is None
- and 'utf-16' in (kwds.get('encoding') or '')):
+ if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""):
# if source is utf-16 plain text, convert source to utf-8
if isinstance(src, str):
- src = open(src, 'rb')
+ src = open(src, "rb")
self.handles.append(src)
- src = UTF8Recoder(src, kwds['encoding'])
- kwds['encoding'] = 'utf-8'
+ src = UTF8Recoder(src, kwds["encoding"])
+ kwds["encoding"] = "utf-8"
# #2442
- kwds['allow_leading_cols'] = self.index_col is not False
+ kwds["allow_leading_cols"] = self.index_col is not False
# GH20529, validate usecol arg before TextReader
- self.usecols, self.usecols_dtype = _validate_usecols_arg(
- kwds['usecols'])
- kwds['usecols'] = self.usecols
+ self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
+ kwds["usecols"] = self.usecols
self._reader = parsers.TextReader(src, **kwds)
self.unnamed_cols = self._reader.unnamed_cols
@@ -1835,19 +1913,18 @@ def __init__(self, src, **kwds):
else:
if len(self._reader.header) > 1:
# we have a multi index in the columns
- self.names, self.index_names, self.col_names, passed_names = (
- self._extract_multi_indexer_columns(
- self._reader.header, self.index_names, self.col_names,
- passed_names
- )
+ self.names, self.index_names, self.col_names, passed_names = self._extract_multi_indexer_columns(
+ self._reader.header, self.index_names, self.col_names, passed_names
)
else:
self.names = list(self._reader.header[0])
if self.names is None:
if self.prefix:
- self.names = ['{prefix}{i}'.format(prefix=self.prefix, i=i)
- for i in range(self._reader.table_width)]
+ self.names = [
+ "{prefix}{i}".format(prefix=self.prefix, i=i)
+ for i in range(self._reader.table_width)
+ ]
else:
self.names = list(range(self._reader.table_width))
@@ -1865,19 +1942,23 @@ def __init__(self, src, **kwds):
usecols = _evaluate_usecols(self.usecols, self.orig_names)
# GH 14671
- if (self.usecols_dtype == 'string' and
- not set(usecols).issubset(self.orig_names)):
+ if self.usecols_dtype == "string" and not set(usecols).issubset(
+ self.orig_names
+ ):
_validate_usecols_names(usecols, self.orig_names)
# GH 25623
# validate that column indices in usecols are not out of bounds
- elif self.usecols_dtype == 'integer':
+ elif self.usecols_dtype == "integer":
indices = range(self._reader.table_width)
_validate_usecols_names(usecols, indices)
if len(self.names) > len(usecols):
- self.names = [n for i, n in enumerate(self.names)
- if (i in usecols or n in usecols)]
+ self.names = [
+ n
+ for i, n in enumerate(self.names)
+ if (i in usecols or n in usecols)
+ ]
if len(self.names) < len(usecols):
_validate_usecols_names(usecols, self.names)
@@ -1887,14 +1968,12 @@ def __init__(self, src, **kwds):
self.orig_names = self.names
if not self._has_complex_date_col:
- if (self._reader.leading_cols == 0 and
- _is_index_col(self.index_col)):
+ if self._reader.leading_cols == 0 and _is_index_col(self.index_col):
self._name_processed = True
- (index_names, self.names,
- self.index_col) = _clean_index_names(self.names,
- self.index_col,
- self.unnamed_cols)
+ (index_names, self.names, self.index_col) = _clean_index_names(
+ self.names, self.index_col, self.unnamed_cols
+ )
if self.index_names is None:
self.index_names = index_names
@@ -1922,13 +2001,12 @@ def _set_noconvert_columns(self):
undergo such conversions.
"""
names = self.orig_names
- if self.usecols_dtype == 'integer':
+ if self.usecols_dtype == "integer":
# A set of integers will be converted to a list in
# the correct order every single time.
usecols = list(self.usecols)
usecols.sort()
- elif (callable(self.usecols) or
- self.usecols_dtype not in ('empty', None)):
+ elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
# The names attribute should have the correct columns
# in the proper order for indexing with parse_dates.
usecols = self.names[:]
@@ -1979,16 +2057,19 @@ def read(self, nrows=None):
self._first_chunk = False
names = self._maybe_dedup_names(self.orig_names)
index, columns, col_dict = _get_empty_meta(
- names, self.index_col, self.index_names,
- dtype=self.kwds.get('dtype'))
- columns = self._maybe_make_multi_index_columns(
- columns, self.col_names)
+ names,
+ self.index_col,
+ self.index_names,
+ dtype=self.kwds.get("dtype"),
+ )
+ columns = self._maybe_make_multi_index_columns(columns, self.col_names)
if self.usecols is not None:
columns = self._filter_usecols(columns)
- col_dict = dict(filter(lambda item: item[0] in columns,
- col_dict.items()))
+ col_dict = dict(
+ filter(lambda item: item[0] in columns, col_dict.items())
+ )
return index, columns, col_dict
@@ -2002,7 +2083,7 @@ def read(self, nrows=None):
if self._reader.leading_cols:
if self._has_complex_date_col:
- raise NotImplementedError('file structure not yet supported')
+ raise NotImplementedError("file structure not yet supported")
# implicit index, no index names
arrays = []
@@ -2013,8 +2094,7 @@ def read(self, nrows=None):
else:
values = data.pop(self.index_col[i])
- values = self._maybe_parse_dates(values, i,
- try_parse_dates=True)
+ values = self._maybe_parse_dates(values, i, try_parse_dates=True)
arrays.append(values)
index = ensure_index_from_sequences(arrays)
@@ -2058,8 +2138,9 @@ def _filter_usecols(self, names):
# hackish
usecols = _evaluate_usecols(self.usecols, names)
if usecols is not None and len(names) != len(usecols):
- names = [name for i, name in enumerate(names)
- if i in usecols or name in usecols]
+ names = [
+ name for i, name in enumerate(names) if i in usecols or name in usecols
+ ]
return names
def _get_index_names(self):
@@ -2067,9 +2148,9 @@ def _get_index_names(self):
idx_names = None
if self._reader.leading_cols == 0 and self.index_col is not None:
- (idx_names, names,
- self.index_col) = _clean_index_names(names, self.index_col,
- self.unnamed_cols)
+ (idx_names, names, self.index_col) = _clean_index_names(
+ names, self.index_col, self.unnamed_cols
+ )
return names, idx_names
@@ -2133,16 +2214,15 @@ def TextParser(*args, **kwds):
'high' for the high-precision converter, and 'round_trip' for the
round-trip converter.
"""
- kwds['engine'] = 'python'
+ kwds["engine"] = "python"
return TextFileReader(*args, **kwds)
def count_empty_vals(vals):
- return sum(1 for v in vals if v == '' or v is None)
+ return sum(1 for v in vals if v == "" or v is None)
class PythonParser(ParserBase):
-
def __init__(self, f, **kwds):
"""
Workhorse function for processing nested list into DataFrame
@@ -2156,58 +2236,61 @@ def __init__(self, f, **kwds):
self.pos = 0
self.line_pos = 0
- self.encoding = kwds['encoding']
- self.compression = kwds['compression']
- self.memory_map = kwds['memory_map']
- self.skiprows = kwds['skiprows']
+ self.encoding = kwds["encoding"]
+ self.compression = kwds["compression"]
+ self.memory_map = kwds["memory_map"]
+ self.skiprows = kwds["skiprows"]
if callable(self.skiprows):
self.skipfunc = self.skiprows
else:
self.skipfunc = lambda x: x in self.skiprows
- self.skipfooter = _validate_skipfooter_arg(kwds['skipfooter'])
- self.delimiter = kwds['delimiter']
+ self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
+ self.delimiter = kwds["delimiter"]
- self.quotechar = kwds['quotechar']
+ self.quotechar = kwds["quotechar"]
if isinstance(self.quotechar, str):
self.quotechar = str(self.quotechar)
- self.escapechar = kwds['escapechar']
- self.doublequote = kwds['doublequote']
- self.skipinitialspace = kwds['skipinitialspace']
- self.lineterminator = kwds['lineterminator']
- self.quoting = kwds['quoting']
- self.usecols, self.usecols_dtype = _validate_usecols_arg(
- kwds['usecols'])
- self.skip_blank_lines = kwds['skip_blank_lines']
+ self.escapechar = kwds["escapechar"]
+ self.doublequote = kwds["doublequote"]
+ self.skipinitialspace = kwds["skipinitialspace"]
+ self.lineterminator = kwds["lineterminator"]
+ self.quoting = kwds["quoting"]
+ self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
+ self.skip_blank_lines = kwds["skip_blank_lines"]
- self.warn_bad_lines = kwds['warn_bad_lines']
- self.error_bad_lines = kwds['error_bad_lines']
+ self.warn_bad_lines = kwds["warn_bad_lines"]
+ self.error_bad_lines = kwds["error_bad_lines"]
- self.names_passed = kwds['names'] or None
+ self.names_passed = kwds["names"] or None
self.has_index_names = False
- if 'has_index_names' in kwds:
- self.has_index_names = kwds['has_index_names']
+ if "has_index_names" in kwds:
+ self.has_index_names = kwds["has_index_names"]
- self.verbose = kwds['verbose']
- self.converters = kwds['converters']
+ self.verbose = kwds["verbose"]
+ self.converters = kwds["converters"]
- self.dtype = kwds['dtype']
- self.thousands = kwds['thousands']
- self.decimal = kwds['decimal']
+ self.dtype = kwds["dtype"]
+ self.thousands = kwds["thousands"]
+ self.decimal = kwds["decimal"]
- self.comment = kwds['comment']
+ self.comment = kwds["comment"]
self._comment_lines = []
- f, handles = _get_handle(f, 'r', encoding=self.encoding,
- compression=self.compression,
- memory_map=self.memory_map)
+ f, handles = _get_handle(
+ f,
+ "r",
+ encoding=self.encoding,
+ compression=self.compression,
+ memory_map=self.memory_map,
+ )
self.handles.extend(handles)
# Set self.data to something that can read lines.
- if hasattr(f, 'readline'):
+ if hasattr(f, "readline"):
self._make_reader(f)
else:
self.data = f
@@ -2215,17 +2298,18 @@ def __init__(self, f, **kwds):
# Get columns in two steps: infer from data, then
# infer column indices from self.usecols if it is specified.
self._col_indices = None
- (self.columns, self.num_original_columns,
- self.unnamed_cols) = self._infer_columns()
+ (
+ self.columns,
+ self.num_original_columns,
+ self.unnamed_cols,
+ ) = self._infer_columns()
# Now self.columns has the set of columns that we will process.
# The original set is stored in self.original_columns.
if len(self.columns) > 1:
# we are processing a multi index column
- self.columns, self.index_names, self.col_names, _ = (
- self._extract_multi_indexer_columns(
- self.columns, self.index_names, self.col_names
- )
+ self.columns, self.index_names, self.col_names, _ = self._extract_multi_indexer_columns(
+ self.columns, self.index_names, self.col_names
)
# Update list of original names to include all indices.
self.num_original_columns = len(self.columns)
@@ -2239,8 +2323,9 @@ def __init__(self, f, **kwds):
# multiple date column thing turning into a real spaghetti factory
if not self._has_complex_date_col:
- (index_names, self.orig_names, self.columns) = (
- self._get_index_name(self.columns))
+ (index_names, self.orig_names, self.columns) = self._get_index_name(
+ self.columns
+ )
self._name_processed = True
if self.index_names is None:
self.index_names = index_names
@@ -2251,14 +2336,18 @@ def __init__(self, f, **kwds):
self._no_thousands_columns = None
if len(self.decimal) != 1:
- raise ValueError('Only length-1 decimal markers supported')
+ raise ValueError("Only length-1 decimal markers supported")
if self.thousands is None:
self.nonnum = re.compile(
- r'[^-^0-9^{decimal}]+'.format(decimal=self.decimal))
+ r"[^-^0-9^{decimal}]+".format(decimal=self.decimal)
+ )
else:
- self.nonnum = re.compile(r'[^-^0-9^{thousands}^{decimal}]+'.format(
- thousands=self.thousands, decimal=self.decimal))
+ self.nonnum = re.compile(
+ r"[^-^0-9^{thousands}^{decimal}]+".format(
+ thousands=self.thousands, decimal=self.decimal
+ )
+ )
def _set_no_thousands_columns(self):
# Create a set of column ids that are not to be stripped of thousands
@@ -2301,8 +2390,9 @@ def _make_reader(self, f):
if sep is None or len(sep) == 1:
if self.lineterminator:
- raise ValueError('Custom line terminators not supported in '
- 'python parser (yet)')
+ raise ValueError(
+ "Custom line terminators not supported in " "python parser (yet)"
+ )
class MyDialect(csv.Dialect):
delimiter = self.delimiter
@@ -2311,7 +2401,7 @@ class MyDialect(csv.Dialect):
doublequote = self.doublequote
skipinitialspace = self.skipinitialspace
quoting = self.quoting
- lineterminator = '\n'
+ lineterminator = "\n"
dia = MyDialect
@@ -2334,23 +2424,25 @@ class MyDialect(csv.Dialect):
sniffed = csv.Sniffer().sniff(line)
dia.delimiter = sniffed.delimiter
if self.encoding is not None:
- self.buf.extend(list(
- UnicodeReader(StringIO(line),
- dialect=dia,
- encoding=self.encoding)))
+ self.buf.extend(
+ list(
+ UnicodeReader(
+ StringIO(line), dialect=dia, encoding=self.encoding
+ )
+ )
+ )
else:
- self.buf.extend(list(csv.reader(StringIO(line),
- dialect=dia)))
+ self.buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
if self.encoding is not None:
- reader = UnicodeReader(f, dialect=dia,
- encoding=self.encoding,
- strict=True)
+ reader = UnicodeReader(
+ f, dialect=dia, encoding=self.encoding, strict=True
+ )
else:
- reader = csv.reader(f, dialect=dia,
- strict=True)
+ reader = csv.reader(f, dialect=dia, strict=True)
else:
+
def _read():
line = f.readline()
pat = re.compile(sep)
@@ -2359,6 +2451,7 @@ def _read():
for line in f:
yield pat.split(line.strip())
+
reader = _read()
self.data = reader
@@ -2380,9 +2473,9 @@ def read(self, rows=None):
# DataFrame with the right metadata, even though it's length 0
names = self._maybe_dedup_names(self.orig_names)
index, columns, col_dict = _get_empty_meta(
- names, self.index_col, self.index_names, self.dtype)
- columns = self._maybe_make_multi_index_columns(
- columns, self.col_names)
+ names, self.index_col, self.index_names, self.dtype
+ )
+ columns = self._maybe_make_multi_index_columns(columns, self.col_names)
return index, columns, col_dict
# handle new style for names in index
@@ -2462,9 +2555,14 @@ def _clean_mapping(mapping):
clean_na_values = self.na_values
clean_na_fvalues = self.na_fvalues
- return self._convert_to_ndarrays(data, clean_na_values,
- clean_na_fvalues, self.verbose,
- clean_conv, clean_dtypes)
+ return self._convert_to_ndarrays(
+ data,
+ clean_na_values,
+ clean_na_fvalues,
+ self.verbose,
+ clean_conv,
+ clean_dtypes,
+ )
def _infer_columns(self):
names = self.names
@@ -2495,8 +2593,9 @@ def _infer_columns(self):
except StopIteration:
if self.line_pos < hr:
raise ValueError(
- 'Passed header={hr} but only {pos} lines in '
- 'file'.format(hr=hr, pos=(self.line_pos + 1)))
+ "Passed header={hr} but only {pos} lines in "
+ "file".format(hr=hr, pos=(self.line_pos + 1))
+ )
# We have an empty file, so check
# if columns are provided. That will
@@ -2508,8 +2607,7 @@ def _infer_columns(self):
return columns, num_original_columns, unnamed_cols
if not self.names:
- raise EmptyDataError(
- "No columns to parse from file")
+ raise EmptyDataError("No columns to parse from file")
line = self.names[:]
@@ -2517,10 +2615,11 @@ def _infer_columns(self):
this_unnamed_cols = []
for i, c in enumerate(line):
- if c == '':
+ if c == "":
if have_mi_columns:
- col_name = ("Unnamed: {i}_level_{level}"
- .format(i=i, level=level))
+ col_name = "Unnamed: {i}_level_{level}".format(
+ i=i, level=level
+ )
else:
col_name = "Unnamed: {i}".format(i=i)
@@ -2537,8 +2636,7 @@ def _infer_columns(self):
while cur_count > 0:
counts[col] = cur_count + 1
- col = '{column}.{count}'.format(
- column=col, count=cur_count)
+ col = "{column}.{count}".format(column=col, count=cur_count)
cur_count = counts[col]
this_columns[i] = col
@@ -2550,8 +2648,7 @@ def _infer_columns(self):
# line for the rest of the parsing code
if hr == header[-1]:
lc = len(this_columns)
- ic = (len(self.index_col)
- if self.index_col is not None else 0)
+ ic = len(self.index_col) if self.index_col is not None else 0
unnamed_count = len(this_unnamed_cols)
if lc != unnamed_count and lc - ic > unnamed_count:
@@ -2560,8 +2657,7 @@ def _infer_columns(self):
self.buf = [self.buf[-1]]
columns.append(this_columns)
- unnamed_cols.update({this_columns[i]
- for i in this_unnamed_cols})
+ unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})
if len(columns) == 1:
num_original_columns = len(this_columns)
@@ -2571,21 +2667,21 @@ def _infer_columns(self):
# GH 25623
# validate that column indices in usecols are not out of bounds
- if self.usecols_dtype == 'integer':
+ if self.usecols_dtype == "integer":
for col in columns:
indices = range(len(col))
_validate_usecols_names(self.usecols, indices)
if names is not None:
- if ((self.usecols is not None and
- len(names) != len(self.usecols)) or
- (self.usecols is None and
- len(names) != len(columns[0]))):
- raise ValueError('Number of passed names did not match '
- 'number of header fields in the file')
+ if (self.usecols is not None and len(names) != len(self.usecols)) or (
+ self.usecols is None and len(names) != len(columns[0])
+ ):
+ raise ValueError(
+ "Number of passed names did not match "
+ "number of header fields in the file"
+ )
if len(columns) > 1:
- raise TypeError('Cannot pass names with multi-index '
- 'columns')
+ raise TypeError("Cannot pass names with multi-index " "columns")
if self.usecols is not None:
# Set _use_cols. We don't store columns because they are
@@ -2603,8 +2699,7 @@ def _infer_columns(self):
except StopIteration:
if not names:
- raise EmptyDataError(
- "No columns to parse from file")
+ raise EmptyDataError("No columns to parse from file")
line = names[:]
@@ -2613,13 +2708,17 @@ def _infer_columns(self):
# GH 25623
# validate that column indices in usecols are not out of bounds
- if self.usecols_dtype == 'integer':
+ if self.usecols_dtype == "integer":
_validate_usecols_names(self.usecols, range(ncols))
if not names:
if self.prefix:
- columns = [['{prefix}{idx}'.format(
- prefix=self.prefix, idx=i) for i in range(ncols)]]
+ columns = [
+ [
+ "{prefix}{idx}".format(prefix=self.prefix, idx=i)
+ for i in range(ncols)
+ ]
+ ]
else:
columns = [list(range(ncols))]
columns = self._handle_usecols(columns, columns[0])
@@ -2628,11 +2727,10 @@ def _infer_columns(self):
columns = self._handle_usecols([names], names)
num_original_columns = len(names)
else:
- if (not callable(self.usecols) and
- len(names) != len(self.usecols)):
+ if not callable(self.usecols) and len(names) != len(self.usecols):
raise ValueError(
- 'Number of passed names did not match number of '
- 'header fields in the file'
+ "Number of passed names did not match number of "
+ "header fields in the file"
)
# Ignore output but set used columns.
self._handle_usecols([names], names)
@@ -2652,8 +2750,9 @@ def _handle_usecols(self, columns, usecols_key):
col_indices = _evaluate_usecols(self.usecols, usecols_key)
elif any(isinstance(u, str) for u in self.usecols):
if len(columns) > 1:
- raise ValueError("If using multiple headers, usecols must "
- "be integers.")
+ raise ValueError(
+ "If using multiple headers, usecols must " "be integers."
+ )
col_indices = []
for col in self.usecols:
@@ -2667,8 +2766,10 @@ def _handle_usecols(self, columns, usecols_key):
else:
col_indices = self.usecols
- columns = [[n for i, n in enumerate(column) if i in col_indices]
- for column in columns]
+ columns = [
+ [n for i, n in enumerate(column) if i in col_indices]
+ for column in columns
+ ]
self._col_indices = col_indices
return columns
@@ -2724,7 +2825,7 @@ def _check_for_bom(self, first_row):
# Extract any remaining data after the second
# quotation mark.
if len(first_row_bom) > end + 1:
- new_row += first_row_bom[end + 1:]
+ new_row += first_row_bom[end + 1 :]
return [new_row] + first_row[1:]
elif len(first_row_bom) > 1:
@@ -2759,9 +2860,9 @@ def _next_line(self):
line = self._check_comments([self.data[self.pos]])[0]
self.pos += 1
# either uncommented or blank to begin with
- if (not self.skip_blank_lines and
- (self._is_line_empty(
- self.data[self.pos - 1]) or line)):
+ if not self.skip_blank_lines and (
+ self._is_line_empty(self.data[self.pos - 1]) or line
+ ):
break
elif self.skip_blank_lines:
ret = self._remove_empty_lines([line])
@@ -2819,8 +2920,8 @@ def _alert_malformed(self, msg, row_num):
if self.error_bad_lines:
raise ParserError(msg)
elif self.warn_bad_lines:
- base = 'Skipping line {row_num}: '.format(row_num=row_num)
- sys.stderr.write(base + msg + '\n')
+ base = "Skipping line {row_num}: ".format(row_num=row_num)
+ sys.stderr.write(base + msg + "\n")
def _next_iter_line(self, row_num):
"""
@@ -2841,19 +2942,23 @@ def _next_iter_line(self, row_num):
if self.warn_bad_lines or self.error_bad_lines:
msg = str(e)
- if 'NULL byte' in msg:
- msg = ('NULL byte detected. This byte '
- 'cannot be processed in Python\'s '
- 'native csv library at the moment, '
- 'so please pass in engine=\'c\' instead')
+ if "NULL byte" in msg:
+ msg = (
+ "NULL byte detected. This byte "
+ "cannot be processed in Python's "
+ "native csv library at the moment, "
+ "so please pass in engine='c' instead"
+ )
if self.skipfooter > 0:
- reason = ('Error could possibly be due to '
- 'parsing errors in the skipped footer rows '
- '(the skipfooter keyword is only applied '
- 'after Python\'s csv library has parsed '
- 'all rows).')
- msg += '. ' + reason
+ reason = (
+ "Error could possibly be due to "
+ "parsing errors in the skipped footer rows "
+ "(the skipfooter keyword is only applied "
+ "after Python's csv library has parsed "
+ "all rows)."
+ )
+ msg += ". " + reason
self._alert_malformed(msg, row_num)
return None
@@ -2865,11 +2970,10 @@ def _check_comments(self, lines):
for l in lines:
rl = []
for x in l:
- if (not isinstance(x, str) or
- self.comment not in x):
+ if not isinstance(x, str) or self.comment not in x:
rl.append(x)
else:
- x = x[:x.find(self.comment)]
+ x = x[: x.find(self.comment)]
if len(x) > 0:
rl.append(x)
break
@@ -2895,8 +2999,11 @@ def _remove_empty_lines(self, lines):
ret = []
for l in lines:
# Remove empty lines and lines with only one whitespace value
- if (len(l) > 1 or len(l) == 1 and
- (not isinstance(l[0], str) or l[0].strip())):
+ if (
+ len(l) > 1
+ or len(l) == 1
+ and (not isinstance(l[0], str) or l[0].strip())
+ ):
ret.append(l)
return ret
@@ -2904,20 +3011,21 @@ def _check_thousands(self, lines):
if self.thousands is None:
return lines
- return self._search_replace_num_columns(lines=lines,
- search=self.thousands,
- replace='')
+ return self._search_replace_num_columns(
+ lines=lines, search=self.thousands, replace=""
+ )
def _search_replace_num_columns(self, lines, search, replace):
ret = []
for l in lines:
rl = []
for i, x in enumerate(l):
- if (not isinstance(x, str) or
- search not in x or
- (self._no_thousands_columns and
- i in self._no_thousands_columns) or
- self.nonnum.search(x.strip())):
+ if (
+ not isinstance(x, str)
+ or search not in x
+ or (self._no_thousands_columns and i in self._no_thousands_columns)
+ or self.nonnum.search(x.strip())
+ ):
rl.append(x)
else:
rl.append(x.replace(search, replace))
@@ -2925,12 +3033,12 @@ def _search_replace_num_columns(self, lines, search, replace):
return ret
def _check_decimal(self, lines):
- if self.decimal == _parser_defaults['decimal']:
+ if self.decimal == _parser_defaults["decimal"]:
return lines
- return self._search_replace_num_columns(lines=lines,
- search=self.decimal,
- replace='.')
+ return self._search_replace_num_columns(
+ lines=lines, search=self.decimal, replace="."
+ )
def _clear_buffer(self):
self.buf = []
@@ -2995,9 +3103,9 @@ def _get_index_name(self, columns):
else:
# Case 2
- (index_name, columns_,
- self.index_col) = _clean_index_names(columns, self.index_col,
- self.unnamed_cols)
+ (index_name, columns_, self.index_col) = _clean_index_names(
+ columns, self.index_col, self.unnamed_cols
+ )
return index_name, orig_names, columns
@@ -3012,9 +3120,7 @@ def _rows_to_cols(self, content):
# Check that there are no rows with too many
# elements in their row (rows with too few
# elements are padded with NaN).
- if (max_len > col_len and
- self.index_col is not False and
- self.usecols is None):
+ if max_len > col_len and self.index_col is not False and self.usecols is None:
footers = self.skipfooter if self.skipfooter else 0
bad_lines = []
@@ -3037,32 +3143,43 @@ def _rows_to_cols(self, content):
content.append(l)
for row_num, actual_len in bad_lines:
- msg = ('Expected {col_len} fields in line {line}, saw '
- '{length}'.format(col_len=col_len, line=(row_num + 1),
- length=actual_len))
- if (self.delimiter and
- len(self.delimiter) > 1 and
- self.quoting != csv.QUOTE_NONE):
+ msg = (
+ "Expected {col_len} fields in line {line}, saw "
+ "{length}".format(
+ col_len=col_len, line=(row_num + 1), length=actual_len
+ )
+ )
+ if (
+ self.delimiter
+ and len(self.delimiter) > 1
+ and self.quoting != csv.QUOTE_NONE
+ ):
# see gh-13374
- reason = ('Error could possibly be due to quotes being '
- 'ignored when a multi-char delimiter is used.')
- msg += '. ' + reason
+ reason = (
+ "Error could possibly be due to quotes being "
+ "ignored when a multi-char delimiter is used."
+ )
+ msg += ". " + reason
self._alert_malformed(msg, row_num + 1)
# see gh-13320
- zipped_content = list(lib.to_object_array(
- content, min_width=col_len).T)
+ zipped_content = list(lib.to_object_array(content, min_width=col_len).T)
if self.usecols:
if self._implicit_index:
zipped_content = [
- a for i, a in enumerate(zipped_content)
- if (i < len(self.index_col) or
- i - len(self.index_col) in self._col_indices)]
+ a
+ for i, a in enumerate(zipped_content)
+ if (
+ i < len(self.index_col)
+ or i - len(self.index_col) in self._col_indices
+ )
+ ]
else:
- zipped_content = [a for i, a in enumerate(zipped_content)
- if i in self._col_indices]
+ zipped_content = [
+ a for i, a in enumerate(zipped_content) if i in self._col_indices
+ ]
return zipped_content
def _get_lines(self, rows=None):
@@ -3084,16 +3201,19 @@ def _get_lines(self, rows=None):
if self.pos > len(self.data):
raise StopIteration
if rows is None:
- new_rows = self.data[self.pos:]
+ new_rows = self.data[self.pos :]
new_pos = len(self.data)
else:
- new_rows = self.data[self.pos:self.pos + rows]
+ new_rows = self.data[self.pos : self.pos + rows]
new_pos = self.pos + rows
# Check for stop rows. n.b.: self.skiprows is a set.
if self.skiprows:
- new_rows = [row for i, row in enumerate(new_rows)
- if not self.skipfunc(i + self.pos)]
+ new_rows = [
+ row
+ for i, row in enumerate(new_rows)
+ if not self.skipfunc(i + self.pos)
+ ]
lines.extend(new_rows)
self.pos = new_pos
@@ -3109,8 +3229,7 @@ def _get_lines(self, rows=None):
rows = 0
while True:
- new_row = self._next_iter_line(
- row_num=self.pos + rows + 1)
+ new_row = self._next_iter_line(row_num=self.pos + rows + 1)
rows += 1
if new_row is not None:
@@ -3118,8 +3237,11 @@ def _get_lines(self, rows=None):
except StopIteration:
if self.skiprows:
- new_rows = [row for i, row in enumerate(new_rows)
- if not self.skipfunc(i + self.pos)]
+ new_rows = [
+ row
+ for i, row in enumerate(new_rows)
+ if not self.skipfunc(i + self.pos)
+ ]
lines.extend(new_rows)
if len(lines) == 0:
raise
@@ -3130,7 +3252,7 @@ def _get_lines(self, rows=None):
lines = new_rows
if self.skipfooter:
- lines = lines[:-self.skipfooter]
+ lines = lines[: -self.skipfooter]
lines = self._check_comments(lines)
if self.skip_blank_lines:
@@ -3139,8 +3261,9 @@ def _get_lines(self, rows=None):
return self._check_decimal(lines)
-def _make_date_converter(date_parser=None, dayfirst=False,
- infer_datetime_format=False, cache_dates=True):
+def _make_date_converter(
+ date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
+):
def converter(*date_cols):
if date_parser is None:
strs = parsing._concat_date_cols(date_cols)
@@ -3150,25 +3273,22 @@ def converter(*date_cols):
ensure_object(strs),
utc=None,
dayfirst=dayfirst,
- errors='ignore',
+ errors="ignore",
infer_datetime_format=infer_datetime_format,
- cache=cache_dates
+ cache=cache_dates,
).to_numpy()
except ValueError:
return tools.to_datetime(
- parsing.try_parse_dates(strs, dayfirst=dayfirst),
- cache=cache_dates
+ parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
)
else:
try:
result = tools.to_datetime(
- date_parser(*date_cols),
- errors='ignore',
- cache=cache_dates
+ date_parser(*date_cols), errors="ignore", cache=cache_dates
)
if isinstance(result, datetime.datetime):
- raise Exception('scalar parser')
+ raise Exception("scalar parser")
return result
except Exception:
try:
@@ -3176,22 +3296,29 @@ def converter(*date_cols):
parsing.try_parse_dates(
parsing._concat_date_cols(date_cols),
parser=date_parser,
- dayfirst=dayfirst),
- errors='ignore')
+ dayfirst=dayfirst,
+ ),
+ errors="ignore",
+ )
except Exception:
return generic_parser(date_parser, *date_cols)
return converter
-def _process_date_conversion(data_dict, converter, parse_spec,
- index_col, index_names, columns,
- keep_date_col=False):
+def _process_date_conversion(
+ data_dict,
+ converter,
+ parse_spec,
+ index_col,
+ index_names,
+ columns,
+ keep_date_col=False,
+):
def _isindex(colspec):
- return ((isinstance(index_col, list) and
- colspec in index_col) or
- (isinstance(index_names, list) and
- colspec in index_names))
+ return (isinstance(index_col, list) and colspec in index_col) or (
+ isinstance(index_names, list) and colspec in index_names
+ )
new_cols = []
new_data = {}
@@ -3215,11 +3342,12 @@ def _isindex(colspec):
data_dict[colspec] = converter(data_dict[colspec])
else:
new_name, col, old_names = _try_convert_dates(
- converter, colspec, data_dict, orig_names)
+ converter, colspec, data_dict, orig_names
+ )
if new_name in data_dict:
raise ValueError(
- 'New date column already in dict {name}'.format(
- name=new_name))
+ "New date column already in dict {name}".format(name=new_name)
+ )
new_data[new_name] = col
new_cols.append(new_name)
date_cols.update(old_names)
@@ -3229,10 +3357,12 @@ def _isindex(colspec):
for new_name, colspec in parse_spec.items():
if new_name in data_dict:
raise ValueError(
- 'Date column {name} already in dict'.format(name=new_name))
+ "Date column {name} already in dict".format(name=new_name)
+ )
- _, col, old_names = _try_convert_dates(converter, colspec,
- data_dict, orig_names)
+ _, col, old_names = _try_convert_dates(
+ converter, colspec, data_dict, orig_names
+ )
new_data[new_name] = col
new_cols.append(new_name)
@@ -3261,7 +3391,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
else:
colnames.append(c)
- new_name = '_'.join(str(x) for x in colnames)
+ new_name = "_".join(str(x) for x in colnames)
to_parse = [data_dict[c] for c in colnames if c in data_dict]
new_col = parser(*to_parse)
@@ -3377,8 +3507,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None):
for i, n in enumerate(index_col):
columns.pop(n - i)
- col_dict = {col_name: Series([], dtype=dtype[col_name])
- for col_name in columns}
+ col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}
return index, columns, col_dict
@@ -3473,29 +3602,35 @@ class FixedWidthReader(BaseIterator):
A reader of fixed-width lines.
"""
- def __init__(self, f, colspecs, delimiter, comment, skiprows=None,
- infer_nrows=100):
+ def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100):
self.f = f
self.buffer = None
- self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
+ self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
self.comment = comment
- if colspecs == 'infer':
- self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows,
- skiprows=skiprows)
+ if colspecs == "infer":
+ self.colspecs = self.detect_colspecs(
+ infer_nrows=infer_nrows, skiprows=skiprows
+ )
else:
self.colspecs = colspecs
if not isinstance(self.colspecs, (tuple, list)):
- raise TypeError("column specifications must be a list or tuple, "
- "input was a %r" % type(colspecs).__name__)
+ raise TypeError(
+ "column specifications must be a list or tuple, "
+ "input was a %r" % type(colspecs).__name__
+ )
for colspec in self.colspecs:
- if not (isinstance(colspec, (tuple, list)) and
- len(colspec) == 2 and
- isinstance(colspec[0], (int, np.integer, type(None))) and
- isinstance(colspec[1], (int, np.integer, type(None)))):
- raise TypeError('Each column specification must be '
- '2 element tuple or list of integers')
+ if not (
+ isinstance(colspec, (tuple, list))
+ and len(colspec) == 2
+ and isinstance(colspec[0], (int, np.integer, type(None)))
+ and isinstance(colspec[1], (int, np.integer, type(None)))
+ ):
+ raise TypeError(
+ "Each column specification must be "
+ "2 element tuple or list of integers"
+ )
def get_rows(self, infer_nrows, skiprows=None):
"""
@@ -3537,8 +3672,8 @@ def get_rows(self, infer_nrows, skiprows=None):
def detect_colspecs(self, infer_nrows=100, skiprows=None):
# Regex escape the delimiters
- delimiters = ''.join(r'\{}'.format(x) for x in self.delimiter)
- pattern = re.compile('([^{}]+)'.format(delimiters))
+ delimiters = "".join(r"\{}".format(x) for x in self.delimiter)
+ pattern = re.compile("([^{}]+)".format(delimiters))
rows = self.get_rows(infer_nrows, skiprows)
if not rows:
raise EmptyDataError("No rows from which to infer column width")
@@ -3548,7 +3683,7 @@ def detect_colspecs(self, infer_nrows=100, skiprows=None):
rows = [row.partition(self.comment)[0] for row in rows]
for row in rows:
for m in pattern.finditer(row):
- mask[m.start():m.end()] = 1
+ mask[m.start() : m.end()] = 1
shifted = np.roll(mask, 1)
shifted[0] = 0
edges = np.where((mask ^ shifted) == 1)[0]
@@ -3565,8 +3700,7 @@ def __next__(self):
else:
line = next(self.f)
# Note: 'colspecs' is a sequence of half-open intervals.
- return [line[fromm:to].strip(self.delimiter)
- for (fromm, to) in self.colspecs]
+ return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
class FixedWidthFieldParser(PythonParser):
@@ -3577,11 +3711,16 @@ class FixedWidthFieldParser(PythonParser):
def __init__(self, f, **kwds):
# Support iterators, convert to a list.
- self.colspecs = kwds.pop('colspecs')
- self.infer_nrows = kwds.pop('infer_nrows')
+ self.colspecs = kwds.pop("colspecs")
+ self.infer_nrows = kwds.pop("infer_nrows")
PythonParser.__init__(self, f, **kwds)
def _make_reader(self, f):
- self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
- self.comment, self.skiprows,
- self.infer_nrows)
+ self.data = FixedWidthReader(
+ f,
+ self.colspecs,
+ self.delimiter,
+ self.comment,
+ self.skiprows,
+ self.infer_nrows,
+ )
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index afe1622d99eac..4e390de87fc60 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -10,8 +10,7 @@
from pandas.io.common import _get_handle, _stringify_path
-def to_pickle(obj, path, compression='infer',
- protocol=pickle.HIGHEST_PROTOCOL):
+def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL):
"""
Pickle (serialize) object to file.
@@ -70,9 +69,7 @@ def to_pickle(obj, path, compression='infer',
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
- f, fh = _get_handle(path, 'wb',
- compression=compression,
- is_text=False)
+ f, fh = _get_handle(path, "wb", compression=compression, is_text=False)
if protocol < 0:
protocol = pickle.HIGHEST_PROTOCOL
try:
@@ -83,7 +80,7 @@ def to_pickle(obj, path, compression='infer',
_f.close()
-def read_pickle(path, compression='infer'):
+def read_pickle(path, compression="infer"):
"""
Load pickled pandas object (or any object) from file.
@@ -145,7 +142,7 @@ def read_pickle(path, compression='infer'):
>>> os.remove("./dummy.pkl")
"""
path = _stringify_path(path)
- f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)
+ f, fh = _get_handle(path, "rb", compression=compression, is_text=False)
# 1) try standard library Pickle
# 2) try pickle_compat (older pandas version) to handle subclass changes
@@ -160,12 +157,13 @@ def read_pickle(path, compression='infer'):
try:
return pc.load(f, encoding=None)
except Exception: # noqa: E722
- return pc.load(f, encoding='latin1')
+ return pc.load(f, encoding="latin1")
finally:
f.close()
for _f in fh:
_f.close()
+
# compat with sparse pickle / unpickle
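The pickle.py hunks show the other very common black change in this patch: string quotes are normalized to double quotes while the string contents are left untouched. Note that black does not merge implicitly concatenated literals, which is why several wrapped messages later in the patch still read as two adjacent fragments (e.g. "Support for generic buffers has not " "been implemented."). A small check of the quote normalization, again assuming black's format_str/FileMode API; the input line is lifted from read_pickle above:

import black

src = "f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)\n"

# Only the quotes change: 'rb' becomes "rb"; the line already fits within the
# 88-character limit, so no re-wrapping happens.
print(black.format_str(src, mode=black.FileMode()))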
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index f439e365fbcf0..9206463e18fb3 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -22,15 +22,31 @@
from pandas.errors import PerformanceWarning
from pandas.core.dtypes.common import (
- ensure_object, is_categorical_dtype, is_datetime64_dtype,
- is_datetime64tz_dtype, is_extension_type, is_list_like,
- is_timedelta64_dtype)
+ ensure_object,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+ is_datetime64tz_dtype,
+ is_extension_type,
+ is_list_like,
+ is_timedelta64_dtype,
+)
from pandas.core.dtypes.missing import array_equivalent
from pandas import (
- DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, PeriodIndex,
- Series, SparseDataFrame, SparseSeries, TimedeltaIndex, concat, isna,
- to_datetime)
+ DataFrame,
+ DatetimeIndex,
+ Index,
+ Int64Index,
+ MultiIndex,
+ PeriodIndex,
+ Series,
+ SparseDataFrame,
+ SparseSeries,
+ TimedeltaIndex,
+ concat,
+ isna,
+ to_datetime,
+)
from pandas.core.arrays.categorical import Categorical
from pandas.core.arrays.sparse import BlockIndex, IntIndex
import pandas.core.common as com
@@ -42,16 +58,16 @@
from pandas.io.formats.printing import adjoin, pprint_thing
# versioning attribute
-_version = '0.15.2'
+_version = "0.15.2"
# encoding
-_default_encoding = 'UTF-8'
+_default_encoding = "UTF-8"
def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
if isinstance(s, np.bytes_):
- s = s.decode('UTF-8')
+ s = s.decode("UTF-8")
return s
@@ -145,12 +161,7 @@ class DuplicateWarning(Warning):
"""
# formats
-_FORMAT_MAP = {
- 'f': 'fixed',
- 'fixed': 'fixed',
- 't': 'table',
- 'table': 'table',
-}
+_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
format_deprecate_doc = """
the table keyword has been deprecated
@@ -163,38 +174,35 @@ class DuplicateWarning(Warning):
# map object types
_TYPE_MAP = {
-
- Series: 'series',
- SparseSeries: 'sparse_series',
- DataFrame: 'frame',
- SparseDataFrame: 'sparse_frame',
+ Series: "series",
+ SparseSeries: "sparse_series",
+ DataFrame: "frame",
+ SparseDataFrame: "sparse_frame",
}
# storer class map
_STORER_MAP = {
- 'Series': 'LegacySeriesFixed',
- 'DataFrame': 'LegacyFrameFixed',
- 'DataMatrix': 'LegacyFrameFixed',
- 'series': 'SeriesFixed',
- 'sparse_series': 'SparseSeriesFixed',
- 'frame': 'FrameFixed',
- 'sparse_frame': 'SparseFrameFixed',
+ "Series": "LegacySeriesFixed",
+ "DataFrame": "LegacyFrameFixed",
+ "DataMatrix": "LegacyFrameFixed",
+ "series": "SeriesFixed",
+ "sparse_series": "SparseSeriesFixed",
+ "frame": "FrameFixed",
+ "sparse_frame": "SparseFrameFixed",
}
# table class map
_TABLE_MAP = {
- 'generic_table': 'GenericTable',
- 'appendable_series': 'AppendableSeriesTable',
- 'appendable_multiseries': 'AppendableMultiSeriesTable',
- 'appendable_frame': 'AppendableFrameTable',
- 'appendable_multiframe': 'AppendableMultiFrameTable',
- 'worm': 'WORMTable',
+ "generic_table": "GenericTable",
+ "appendable_series": "AppendableSeriesTable",
+ "appendable_multiseries": "AppendableMultiSeriesTable",
+ "appendable_frame": "AppendableFrameTable",
+ "appendable_multiframe": "AppendableMultiFrameTable",
+ "worm": "WORMTable",
}
# axes map
-_AXES_MAP = {
- DataFrame: [0],
-}
+_AXES_MAP = {DataFrame: [0]}
# register our configuration options
dropna_doc = """
@@ -207,12 +215,13 @@ class DuplicateWarning(Warning):
put will default to 'fixed' and append will default to 'table'
"""
-with config.config_prefix('io.hdf'):
- config.register_option('dropna_table', False, dropna_doc,
- validator=config.is_bool)
+with config.config_prefix("io.hdf"):
+ config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
config.register_option(
- 'default_format', None, format_doc,
- validator=config.is_one_of_factory(['fixed', 'table', None])
+ "default_format",
+ None,
+ format_doc,
+ validator=config.is_one_of_factory(["fixed", "table", None]),
)
# oh the troubles to reduce import time
@@ -225,6 +234,7 @@ def _tables():
global _table_file_open_policy_is_strict
if _table_mod is None:
import tables
+
_table_mod = tables
# set the file open policy
@@ -232,17 +242,27 @@ def _tables():
# depending on the HDF5 version
try:
_table_file_open_policy_is_strict = (
- tables.file._FILE_OPEN_POLICY == 'strict')
+ tables.file._FILE_OPEN_POLICY == "strict"
+ )
except AttributeError:
pass
return _table_mod
+
# interface to/from ###
-def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
- append=None, **kwargs):
+def to_hdf(
+ path_or_buf,
+ key,
+ value,
+ mode=None,
+ complevel=None,
+ complib=None,
+ append=None,
+ **kwargs
+):
""" store this object, close it if we opened it """
if append:
@@ -252,14 +272,15 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, str):
- with HDFStore(path_or_buf, mode=mode, complevel=complevel,
- complib=complib) as store:
+ with HDFStore(
+ path_or_buf, mode=mode, complevel=complevel, complib=complib
+ ) as store:
f(store)
else:
f(path_or_buf)
-def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
+def read_hdf(path_or_buf, key=None, mode="r", **kwargs):
"""
Read from the store, close it if we opened it.
@@ -319,24 +340,27 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
>>> reread = pd.read_hdf('./store.h5')
"""
- if mode not in ['r', 'r+', 'a']:
- raise ValueError('mode {0} is not allowed while performing a read. '
- 'Allowed modes are r, r+ and a.'.format(mode))
+ if mode not in ["r", "r+", "a"]:
+ raise ValueError(
+ "mode {0} is not allowed while performing a read. "
+ "Allowed modes are r, r+ and a.".format(mode)
+ )
# grab the scope
- if 'where' in kwargs:
- kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)
+ if "where" in kwargs:
+ kwargs["where"] = _ensure_term(kwargs["where"], scope_level=1)
if isinstance(path_or_buf, HDFStore):
if not path_or_buf.is_open:
- raise IOError('The HDFStore must be open for reading.')
+ raise IOError("The HDFStore must be open for reading.")
store = path_or_buf
auto_close = False
else:
path_or_buf = _stringify_path(path_or_buf)
if not isinstance(path_or_buf, str):
- raise NotImplementedError('Support for generic buffers has not '
- 'been implemented.')
+ raise NotImplementedError(
+ "Support for generic buffers has not " "been implemented."
+ )
try:
exists = os.path.exists(path_or_buf)
@@ -346,7 +370,8 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
if not exists:
raise FileNotFoundError(
- 'File {path} does not exist'.format(path=path_or_buf))
+ "File {path} does not exist".format(path=path_or_buf)
+ )
store = HDFStore(path_or_buf, mode=mode, **kwargs)
# can't auto open/close if we are using an iterator
@@ -357,7 +382,7 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
if key is None:
groups = store.groups()
if len(groups) == 0:
- raise ValueError('No dataset in HDF5 file.')
+ raise ValueError("No dataset in HDF5 file.")
candidate_only_group = groups[0]
# For the HDF file to have only one dataset, all other groups
@@ -366,8 +391,10 @@ def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
# before their children.)
for group_to_check in groups[1:]:
if not _is_metadata_of(group_to_check, candidate_only_group):
- raise ValueError('key must be provided when HDF5 file '
- 'contains multiple datasets.')
+ raise ValueError(
+ "key must be provided when HDF5 file "
+ "contains multiple datasets."
+ )
key = candidate_only_group._v_pathname
return store.select(key, auto_close=auto_close, **kwargs)
except (ValueError, TypeError, KeyError):
@@ -388,7 +415,7 @@ def _is_metadata_of(group, parent_group):
current = group
while current._v_depth > 1:
parent = current._v_parent
- if parent == parent_group and current._v_name == 'meta':
+ if parent == parent_group and current._v_name == "meta":
return True
current = current._v_parent
return False
@@ -439,25 +466,28 @@ class HDFStore:
>>> store.close()
"""
- def __init__(self, path, mode=None, complevel=None, complib=None,
- fletcher32=False, **kwargs):
+ def __init__(
+ self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs
+ ):
- if 'format' in kwargs:
- raise ValueError('format is not a defined argument for HDFStore')
+ if "format" in kwargs:
+ raise ValueError("format is not a defined argument for HDFStore")
tables = import_optional_dependency("tables")
if complib is not None and complib not in tables.filters.all_complibs:
raise ValueError(
"complib only supports {libs} compression.".format(
- libs=tables.filters.all_complibs))
+ libs=tables.filters.all_complibs
+ )
+ )
if complib is None and complevel is not None:
complib = tables.filters.default_complib
self._path = _stringify_path(path)
if mode is None:
- mode = 'a'
+ mode = "a"
self._mode = mode
self._handle = None
self._complevel = complevel if complevel else 0
@@ -496,7 +526,9 @@ def __getattr__(self, name):
pass
raise AttributeError(
"'{object}' object has no attribute '{name}'".format(
- object=type(self).__name__, name=name))
+ object=type(self).__name__, name=name
+ )
+ )
def __contains__(self, key):
""" check for existence of this key
@@ -513,8 +545,9 @@ def __len__(self):
return len(self.groups())
def __repr__(self):
- return '{type}\nFile path: {path}\n'.format(
- type=type(self), path=pprint_thing(self._path))
+ return "{type}\nFile path: {path}\n".format(
+ type=type(self), path=pprint_thing(self._path)
+ )
def __enter__(self):
return self
@@ -546,7 +579,7 @@ def items(self):
iteritems = items
- def open(self, mode='a', **kwargs):
+ def open(self, mode="a", **kwargs):
"""
Open the file in the specified mode
@@ -560,16 +593,15 @@ def open(self, mode='a', **kwargs):
if self._mode != mode:
# if we are changing a write mode to read, ok
- if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
+ if self._mode in ["a", "w"] and mode in ["r", "r+"]:
pass
- elif mode in ['w']:
+ elif mode in ["w"]:
# this would truncate, raise here
if self.is_open:
raise PossibleDataLossError(
"Re-opening the file [{0}] with mode [{1}] "
- "will delete the current file!"
- .format(self._path, self._mode)
+ "will delete the current file!".format(self._path, self._mode)
)
self._mode = mode
@@ -579,16 +611,16 @@ def open(self, mode='a', **kwargs):
self.close()
if self._complevel and self._complevel > 0:
- self._filters = _tables().Filters(self._complevel, self._complib,
- fletcher32=self._fletcher32)
+ self._filters = _tables().Filters(
+ self._complevel, self._complib, fletcher32=self._fletcher32
+ )
try:
self._handle = tables.open_file(self._path, self._mode, **kwargs)
except (IOError) as e: # pragma: no cover
- if 'can not be written' in str(e):
- print(
- 'Opening {path} in read-only mode'.format(path=self._path))
- self._handle = tables.open_file(self._path, 'r', **kwargs)
+ if "can not be written" in str(e):
+ print("Opening {path} in read-only mode".format(path=self._path))
+ self._handle = tables.open_file(self._path, "r", **kwargs)
else:
raise
@@ -596,7 +628,7 @@ def open(self, mode='a', **kwargs):
# trap PyTables >= 3.1 FILE_OPEN_POLICY exception
# to provide an updated message
- if 'FILE_OPEN_POLICY' in str(e):
+ if "FILE_OPEN_POLICY" in str(e):
e = ValueError(
"PyTables [{version}] no longer supports opening multiple "
"files\n"
@@ -605,9 +637,11 @@ def open(self, mode='a', **kwargs):
"and not open the same file multiple times at once,\n"
"upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
"which allows\n"
- "files to be opened multiple times at once\n"
- .format(version=tables.__version__,
- hdf_version=tables.get_hdf5_version()))
+ "files to be opened multiple times at once\n".format(
+ version=tables.__version__,
+ hdf_version=tables.get_hdf5_version(),
+ )
+ )
raise e
@@ -615,7 +649,7 @@ def open(self, mode='a', **kwargs):
# trying to read from a non-existent file causes an error which
# is not part of IOError, make it one
- if self._mode == 'r' and 'Unable to open/create file' in str(e):
+ if self._mode == "r" and "Unable to open/create file" in str(e):
raise IOError(str(e))
raise
@@ -674,11 +708,21 @@ def get(self, key):
"""
group = self.get_node(key)
if group is None:
- raise KeyError('No object named {key} in the file'.format(key=key))
+ raise KeyError("No object named {key} in the file".format(key=key))
return self._read_group(group)
- def select(self, key, where=None, start=None, stop=None, columns=None,
- iterator=False, chunksize=None, auto_close=False, **kwargs):
+ def select(
+ self,
+ key,
+ where=None,
+ start=None,
+ stop=None,
+ columns=None,
+ iterator=False,
+ chunksize=None,
+ auto_close=False,
+ **kwargs
+ ):
"""
Retrieve pandas object stored in file, optionally based on where
criteria
@@ -702,7 +746,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
"""
group = self.get_node(key)
if group is None:
- raise KeyError('No object named {key} in the file'.format(key=key))
+ raise KeyError("No object named {key} in the file".format(key=key))
# create the storer and axes
where = _ensure_term(where, scope_level=1)
@@ -711,19 +755,25 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
# function to call on iteration
def func(_start, _stop, _where):
- return s.read(start=_start, stop=_stop,
- where=_where,
- columns=columns)
+ return s.read(start=_start, stop=_stop, where=_where, columns=columns)
# create the iterator
- it = TableIterator(self, s, func, where=where, nrows=s.nrows,
- start=start, stop=stop, iterator=iterator,
- chunksize=chunksize, auto_close=auto_close)
+ it = TableIterator(
+ self,
+ s,
+ func,
+ where=where,
+ nrows=s.nrows,
+ start=start,
+ stop=stop,
+ iterator=iterator,
+ chunksize=chunksize,
+ auto_close=auto_close,
+ )
return it.get_result()
- def select_as_coordinates(
- self, key, where=None, start=None, stop=None, **kwargs):
+ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs):
"""
return the selection as an Index
@@ -735,8 +785,9 @@ def select_as_coordinates(
stop : integer (defaults to None), row number to stop selection
"""
where = _ensure_term(where, scope_level=1)
- return self.get_storer(key).read_coordinates(where=where, start=start,
- stop=stop, **kwargs)
+ return self.get_storer(key).read_coordinates(
+ where=where, start=start, stop=stop, **kwargs
+ )
def select_column(self, key, column, **kwargs):
"""
@@ -758,9 +809,19 @@ def select_column(self, key, column, **kwargs):
"""
return self.get_storer(key).read_column(column=column, **kwargs)
- def select_as_multiple(self, keys, where=None, selector=None, columns=None,
- start=None, stop=None, iterator=False,
- chunksize=None, auto_close=False, **kwargs):
+ def select_as_multiple(
+ self,
+ keys,
+ where=None,
+ selector=None,
+ columns=None,
+ start=None,
+ stop=None,
+ iterator=False,
+ chunksize=None,
+ auto_close=False,
+ **kwargs
+ ):
""" Retrieve pandas objects from multiple tables
Parameters
@@ -786,9 +847,16 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
if isinstance(keys, (list, tuple)) and len(keys) == 1:
keys = keys[0]
if isinstance(keys, str):
- return self.select(key=keys, where=where, columns=columns,
- start=start, stop=stop, iterator=iterator,
- chunksize=chunksize, **kwargs)
+ return self.select(
+ key=keys,
+ where=where,
+ columns=columns,
+ start=start,
+ stop=stop,
+ iterator=iterator,
+ chunksize=chunksize,
+ **kwargs
+ )
if not isinstance(keys, (list, tuple)):
raise TypeError("keys must be a list/tuple")
@@ -817,8 +885,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
if nrows is None:
nrows = t.nrows
elif t.nrows != nrows:
- raise ValueError(
- "all tables must have exactly the same nrows!")
+ raise ValueError("all tables must have exactly the same nrows!")
# axis is the concatenation axis
axis = list({t.non_index_axes[0][0] for t in tbls})[0]
@@ -827,17 +894,29 @@ def func(_start, _stop, _where):
# retrieve the objs, _where is always passed as a set of
# coordinates here
- objs = [t.read(where=_where, columns=columns, start=_start,
- stop=_stop, **kwargs) for t in tbls]
+ objs = [
+ t.read(
+ where=_where, columns=columns, start=_start, stop=_stop, **kwargs
+ )
+ for t in tbls
+ ]
# concat and return
- return concat(objs, axis=axis,
- verify_integrity=False)._consolidate()
+ return concat(objs, axis=axis, verify_integrity=False)._consolidate()
# create the iterator
- it = TableIterator(self, s, func, where=where, nrows=nrows,
- start=start, stop=stop, iterator=iterator,
- chunksize=chunksize, auto_close=auto_close)
+ it = TableIterator(
+ self,
+ s,
+ func,
+ where=where,
+ nrows=nrows,
+ start=start,
+ stop=stop,
+ iterator=iterator,
+ chunksize=chunksize,
+ auto_close=auto_close,
+ )
return it.get_result(coordinates=True)
@@ -867,7 +946,7 @@ def put(self, key, value, format=None, append=False, **kwargs):
the store settable by the option 'io.hdf.dropna_table'
"""
if format is None:
- format = get_option("io.hdf.default_format") or 'fixed'
+ format = get_option("io.hdf.default_format") or "fixed"
kwargs = self._validate_format(format, kwargs)
self._write_to_group(key, value, append=append, **kwargs)
@@ -902,7 +981,8 @@ def remove(self, key, where=None, start=None, stop=None):
if where is not None:
raise ValueError(
- "trying to remove a node with a non-None where clause!")
+ "trying to remove a node with a non-None where clause!"
+ )
# we are actually trying to remove a node (with children)
s = self.get_node(key)
@@ -918,11 +998,13 @@ def remove(self, key, where=None, start=None, stop=None):
else:
if not s.is_table:
raise ValueError(
- 'can only remove with where on objects written as tables')
+ "can only remove with where on objects written as tables"
+ )
return s.delete(where=where, start=start, stop=stop)
- def append(self, key, value, format=None, append=True, columns=None,
- dropna=None, **kwargs):
+ def append(
+ self, key, value, format=None, append=True, columns=None, dropna=None, **kwargs
+ ):
"""
Append to Table in file. Node must already exist and be Table
format.
@@ -957,19 +1039,20 @@ def append(self, key, value, format=None, append=True, columns=None,
data in the table, so be careful
"""
if columns is not None:
- raise TypeError("columns is not a supported keyword in append, "
- "try data_columns")
+ raise TypeError(
+ "columns is not a supported keyword in append, " "try data_columns"
+ )
if dropna is None:
dropna = get_option("io.hdf.dropna_table")
if format is None:
- format = get_option("io.hdf.default_format") or 'table'
+ format = get_option("io.hdf.default_format") or "table"
kwargs = self._validate_format(format, kwargs)
- self._write_to_group(key, value, append=append, dropna=dropna,
- **kwargs)
+ self._write_to_group(key, value, append=append, dropna=dropna, **kwargs)
- def append_to_multiple(self, d, value, selector, data_columns=None,
- axes=None, dropna=False, **kwargs):
+ def append_to_multiple(
+ self, d, value, selector, data_columns=None, axes=None, dropna=False, **kwargs
+ ):
"""
Append to multiple tables
@@ -992,9 +1075,11 @@ def append_to_multiple(self, d, value, selector, data_columns=None,
"""
if axes is not None:
- raise TypeError("axes is currently not accepted as a parameter to"
- " append_to_multiple; you can create the "
- "tables independently instead")
+ raise TypeError(
+ "axes is currently not accepted as a parameter to"
+ " append_to_multiple; you can create the "
+ "tables independently instead"
+ )
if not isinstance(d, dict):
raise ValueError(
@@ -1035,7 +1120,7 @@ def append_to_multiple(self, d, value, selector, data_columns=None,
# ensure rows are synchronized across the tables
if dropna:
- idxs = (value[cols].dropna(how='all').index for cols in d.values())
+ idxs = (value[cols].dropna(how="all").index for cols in d.values())
valid_index = next(idxs)
for index in idxs:
valid_index = valid_index.intersection(index)
@@ -1069,8 +1154,7 @@ def create_table_index(self, key, **kwargs):
return
if not s.is_table:
- raise TypeError(
- "cannot create table index on a Fixed format store")
+ raise TypeError("cannot create table index on a Fixed format store")
s.create_index(**kwargs)
def groups(self):
@@ -1084,12 +1168,16 @@ def groups(self):
_tables()
self._check_if_open()
return [
- g for g in self._handle.walk_groups()
- if (not isinstance(g, _table_mod.link.Link) and
- (getattr(g._v_attrs, 'pandas_type', None) or
- getattr(g, 'table', None) or
- (isinstance(g, _table_mod.table.Table) and
- g._v_name != 'table')))
+ g
+ for g in self._handle.walk_groups()
+ if (
+ not isinstance(g, _table_mod.link.Link)
+ and (
+ getattr(g._v_attrs, "pandas_type", None)
+ or getattr(g, "table", None)
+ or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
+ )
+ )
]
def walk(self, where="/"):
@@ -1123,27 +1211,27 @@ def walk(self, where="/"):
_tables()
self._check_if_open()
for g in self._handle.walk_groups(where):
- if getattr(g._v_attrs, 'pandas_type', None) is not None:
+ if getattr(g._v_attrs, "pandas_type", None) is not None:
continue
groups = []
leaves = []
for child in g._v_children.values():
- pandas_type = getattr(child._v_attrs, 'pandas_type', None)
+ pandas_type = getattr(child._v_attrs, "pandas_type", None)
if pandas_type is None:
if isinstance(child, _table_mod.group.Group):
groups.append(child._v_name)
else:
leaves.append(child._v_name)
- yield (g._v_pathname.rstrip('/'), groups, leaves)
+ yield (g._v_pathname.rstrip("/"), groups, leaves)
def get_node(self, key):
""" return the node with the key or None if it does not exist """
self._check_if_open()
try:
- if not key.startswith('/'):
- key = '/' + key
+ if not key.startswith("/"):
+ key = "/" + key
return self._handle.get_node(self.root, key)
except _table_mod.exceptions.NoSuchNodeError:
return None
@@ -1152,14 +1240,23 @@ def get_storer(self, key):
""" return the storer object for a key, raise if not in the file """
group = self.get_node(key)
if group is None:
- raise KeyError('No object named {key} in the file'.format(key=key))
+ raise KeyError("No object named {key} in the file".format(key=key))
s = self._create_storer(group)
s.infer_axes()
return s
- def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
- complevel=None, fletcher32=False, overwrite=True):
+ def copy(
+ self,
+ file,
+ mode="w",
+ propindexes=True,
+ keys=None,
+ complib=None,
+ complevel=None,
+ fletcher32=False,
+ overwrite=True,
+ ):
""" copy the existing store to a new file, upgrading in place
Parameters
@@ -1176,11 +1273,8 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
"""
new_store = HDFStore(
- file,
- mode=mode,
- complib=complib,
- complevel=complevel,
- fletcher32=fletcher32)
+ file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
+ )
if keys is None:
keys = list(self.keys())
if not isinstance(keys, (tuple, list)):
@@ -1200,9 +1294,11 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
if propindexes:
index = [a.name for a in s.axes if a.is_indexed]
new_store.append(
- k, data, index=index,
- data_columns=getattr(s, 'data_columns', None),
- encoding=s.encoding
+ k,
+ data,
+ index=index,
+ data_columns=getattr(s, "data_columns", None),
+ encoding=s.encoding,
)
else:
new_store.put(k, data, encoding=s.encoding)
@@ -1219,8 +1315,9 @@ def info(self):
-------
str
"""
- output = '{type}\nFile path: {path}\n'.format(
- type=type(self), path=pprint_thing(self._path))
+ output = "{type}\nFile path: {path}\n".format(
+ type=type(self), path=pprint_thing(self._path)
+ )
if self.is_open:
lkeys = sorted(list(self.keys()))
if len(lkeys):
@@ -1232,17 +1329,18 @@ def info(self):
s = self.get_storer(k)
if s is not None:
keys.append(pprint_thing(s.pathname or k))
- values.append(
- pprint_thing(s or 'invalid_HDFStore node'))
+ values.append(pprint_thing(s or "invalid_HDFStore node"))
except Exception as detail:
keys.append(k)
values.append(
"[invalid_HDFStore node: {detail}]".format(
- detail=pprint_thing(detail)))
+ detail=pprint_thing(detail)
+ )
+ )
output += adjoin(12, keys, values)
else:
- output += 'Empty'
+ output += "Empty"
else:
output += "File is CLOSED"
@@ -1259,58 +1357,64 @@ def _validate_format(self, format, kwargs):
# validate
try:
- kwargs['format'] = _FORMAT_MAP[format.lower()]
+ kwargs["format"] = _FORMAT_MAP[format.lower()]
except KeyError:
- raise TypeError("invalid HDFStore format specified [{0}]"
- .format(format))
+ raise TypeError("invalid HDFStore format specified [{0}]".format(format))
return kwargs
- def _create_storer(self, group, format=None, value=None, append=False,
- **kwargs):
+ def _create_storer(self, group, format=None, value=None, append=False, **kwargs):
""" return a suitable class to operate """
def error(t):
raise TypeError(
"cannot properly create the storer for: [{t}] [group->"
"{group},value->{value},format->{format},append->{append},"
- "kwargs->{kwargs}]".format(t=t, group=group,
- value=type(value), format=format,
- append=append, kwargs=kwargs))
+ "kwargs->{kwargs}]".format(
+ t=t,
+ group=group,
+ value=type(value),
+ format=format,
+ append=append,
+ kwargs=kwargs,
+ )
+ )
- pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
- tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))
+ pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
+ tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
# infer the pt from the passed value
if pt is None:
if value is None:
_tables()
- if (getattr(group, 'table', None) or
- isinstance(group, _table_mod.table.Table)):
- pt = 'frame_table'
- tt = 'generic_table'
+ if getattr(group, "table", None) or isinstance(
+ group, _table_mod.table.Table
+ ):
+ pt = "frame_table"
+ tt = "generic_table"
else:
raise TypeError(
"cannot create a storer if the object is not existing "
- "nor a value are passed")
+ "nor a value are passed"
+ )
else:
try:
pt = _TYPE_MAP[type(value)]
except KeyError:
- error('_TYPE_MAP')
+ error("_TYPE_MAP")
# we are actually a table
- if format == 'table':
- pt += '_table'
+ if format == "table":
+ pt += "_table"
# a storer node
- if 'table' not in pt:
+ if "table" not in pt:
try:
return globals()[_STORER_MAP[pt]](self, group, **kwargs)
except KeyError:
- error('_STORER_MAP')
+ error("_STORER_MAP")
# existing node (and must be a table)
if tt is None:
@@ -1318,43 +1422,52 @@ def error(t):
# if we are a writer, determine the tt
if value is not None:
- if pt == 'series_table':
- index = getattr(value, 'index', None)
+ if pt == "series_table":
+ index = getattr(value, "index", None)
if index is not None:
if index.nlevels == 1:
- tt = 'appendable_series'
+ tt = "appendable_series"
elif index.nlevels > 1:
- tt = 'appendable_multiseries'
- elif pt == 'frame_table':
- index = getattr(value, 'index', None)
+ tt = "appendable_multiseries"
+ elif pt == "frame_table":
+ index = getattr(value, "index", None)
if index is not None:
if index.nlevels == 1:
- tt = 'appendable_frame'
+ tt = "appendable_frame"
elif index.nlevels > 1:
- tt = 'appendable_multiframe'
- elif pt == 'wide_table':
- tt = 'appendable_panel'
- elif pt == 'ndim_table':
- tt = 'appendable_ndim'
+ tt = "appendable_multiframe"
+ elif pt == "wide_table":
+ tt = "appendable_panel"
+ elif pt == "ndim_table":
+ tt = "appendable_ndim"
else:
# distinguish between a frame/table
- tt = 'legacy_panel'
+ tt = "legacy_panel"
try:
fields = group.table._v_attrs.fields
- if len(fields) == 1 and fields[0] == 'value':
- tt = 'legacy_frame'
+ if len(fields) == 1 and fields[0] == "value":
+ tt = "legacy_frame"
except IndexError:
pass
try:
return globals()[_TABLE_MAP[tt]](self, group, **kwargs)
except KeyError:
- error('_TABLE_MAP')
-
- def _write_to_group(self, key, value, format, index=True, append=False,
- complib=None, encoding=None, **kwargs):
+ error("_TABLE_MAP")
+
+ def _write_to_group(
+ self,
+ key,
+ value,
+ format,
+ index=True,
+ append=False,
+ complib=None,
+ encoding=None,
+ **kwargs
+ ):
group = self.get_node(key)
# remove the node if we are not appending
@@ -1364,43 +1477,41 @@ def _write_to_group(self, key, value, format, index=True, append=False,
# we don't want to store a table node at all if our object is 0-len
# as there are no dtypes
- if getattr(value, 'empty', None) and (format == 'table' or append):
+ if getattr(value, "empty", None) and (format == "table" or append):
return
if group is None:
- paths = key.split('/')
+ paths = key.split("/")
# recursively create the groups
- path = '/'
+ path = "/"
for p in paths:
if not len(p):
continue
new_path = path
- if not path.endswith('/'):
- new_path += '/'
+ if not path.endswith("/"):
+ new_path += "/"
new_path += p
group = self.get_node(new_path)
if group is None:
group = self._handle.create_group(path, p)
path = new_path
- s = self._create_storer(group, format, value, append=append,
- encoding=encoding, **kwargs)
+ s = self._create_storer(
+ group, format, value, append=append, encoding=encoding, **kwargs
+ )
if append:
# raise if we are trying to append to a Fixed format,
# or a table that exists (and we are putting)
- if (not s.is_table or
- (s.is_table and format == 'fixed' and s.is_exists)):
- raise ValueError('Can only append to Tables')
+ if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
+ raise ValueError("Can only append to Tables")
if not s.is_exists:
s.set_object_info()
else:
s.set_object_info()
if not s.is_table and complib:
- raise ValueError(
- 'Compression not supported on Fixed format stores'
- )
+ raise ValueError("Compression not supported on Fixed format stores")
# write the object
s.write(obj=value, append=append, complib=complib, **kwargs)
@@ -1435,8 +1546,19 @@ class TableIterator:
kwargs : the passed kwargs
"""
- def __init__(self, store, s, func, where, nrows, start=None, stop=None,
- iterator=False, chunksize=None, auto_close=False):
+ def __init__(
+ self,
+ store,
+ s,
+ func,
+ where,
+ nrows,
+ start=None,
+ stop=None,
+ iterator=False,
+ chunksize=None,
+ auto_close=False,
+ ):
self.store = store
self.s = s
self.func = func
@@ -1491,8 +1613,7 @@ def get_result(self, coordinates=False):
# return the actual iterator
if self.chunksize is not None:
if not self.s.is_table:
- raise TypeError(
- "can only use an iterator or chunksize on a table")
+ raise TypeError("can only use an iterator or chunksize on a table")
self.coordinates = self.s.read_coordinates(where=self.where)
@@ -1500,8 +1621,9 @@ def get_result(self, coordinates=False):
# if specified read via coordinates (necessary for multiple selections)
if coordinates:
- where = self.s.read_coordinates(where=self.where, start=self.start,
- stop=self.stop)
+ where = self.s.read_coordinates(
+ where=self.where, start=self.start, stop=self.stop
+ )
else:
where = self.where
@@ -1525,13 +1647,27 @@ class IndexCol:
pos : the position in the pytables
"""
+
is_an_indexable = True
is_data_indexable = True
- _info_fields = ['freq', 'tz', 'index_name']
-
- def __init__(self, values=None, kind=None, typ=None, cname=None,
- itemsize=None, name=None, axis=None, kind_attr=None,
- pos=None, freq=None, tz=None, index_name=None, **kwargs):
+ _info_fields = ["freq", "tz", "index_name"]
+
+ def __init__(
+ self,
+ values=None,
+ kind=None,
+ typ=None,
+ cname=None,
+ itemsize=None,
+ name=None,
+ axis=None,
+ kind_attr=None,
+ pos=None,
+ freq=None,
+ tz=None,
+ index_name=None,
+ **kwargs
+ ):
self.values = values
self.kind = kind
self.typ = typ
@@ -1581,20 +1717,21 @@ def set_table(self, table):
def __repr__(self):
temp = tuple(
- map(pprint_thing,
- (self.name,
- self.cname,
- self.axis,
- self.pos,
- self.kind)))
- return ','.join(("{key}->{value}".format(key=key, value=value)
- for key, value in zip(
- ['name', 'cname', 'axis', 'pos', 'kind'], temp)))
+ map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
+ )
+ return ",".join(
+ (
+ "{key}->{value}".format(key=key, value=value)
+ for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
+ )
+ )
def __eq__(self, other):
""" compare 2 col items """
- return all(getattr(self, a, None) == getattr(other, a, None)
- for a in ['name', 'cname', 'axis', 'pos'])
+ return all(
+ getattr(self, a, None) == getattr(other, a, None)
+ for a in ["name", "cname", "axis", "pos"]
+ )
def __ne__(self, other):
return not self.__eq__(other)
@@ -1620,8 +1757,7 @@ def infer(self, handler):
new_self.read_metadata(handler)
return new_self
- def convert(self, values, nan_rep, encoding, errors, start=None,
- stop=None):
+ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None):
""" set the values from this selection: take = take ownership """
# values is a recarray
@@ -1632,9 +1768,9 @@ def convert(self, values, nan_rep, encoding, errors, start=None,
kwargs = dict()
if self.freq is not None:
- kwargs['freq'] = _ensure_decoded(self.freq)
+ kwargs["freq"] = _ensure_decoded(self.freq)
if self.index_name is not None:
- kwargs['name'] = _ensure_decoded(self.index_name)
+ kwargs["name"] = _ensure_decoded(self.index_name)
# making an Index instance could throw a number of different errors
try:
self.values = Index(values, **kwargs)
@@ -1642,8 +1778,8 @@ def convert(self, values, nan_rep, encoding, errors, start=None,
# if the output freq is different from what we recorded,
# it should be None (see also 'doc example part 2')
- if 'freq' in kwargs:
- kwargs['freq'] = None
+ if "freq" in kwargs:
+ kwargs["freq"] = None
self.values = Index(values, **kwargs)
self.values = _set_tz(self.values, self.tz)
@@ -1680,14 +1816,13 @@ def maybe_set_size(self, min_itemsize=None):
""" maybe set a string col itemsize:
min_itemsize can be an integer or a dict with this column's name
with an integer size """
- if _ensure_decoded(self.kind) == 'string':
+ if _ensure_decoded(self.kind) == "string":
if isinstance(min_itemsize, dict):
min_itemsize = min_itemsize.get(self.name)
if min_itemsize is not None and self.typ.itemsize < min_itemsize:
- self.typ = _tables(
- ).StringCol(itemsize=min_itemsize, pos=self.pos)
+ self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
def validate(self, handler, append):
self.validate_names()
@@ -1707,7 +1842,7 @@ def validate_col(self, itemsize=None):
""" validate this column: return the compared against itemsize """
# validate this column for string truncation (or reset to the max size)
- if _ensure_decoded(self.kind) == 'string':
+ if _ensure_decoded(self.kind) == "string":
c = self.col
if c is not None:
if itemsize is None:
@@ -1718,8 +1853,9 @@ def validate_col(self, itemsize=None):
"[{cname}] column but\nthis column has a limit of "
"[{c_itemsize}]!\nConsider using min_itemsize to "
"preset the sizes on these columns".format(
- itemsize=itemsize, cname=self.cname,
- c_itemsize=c.itemsize))
+ itemsize=itemsize, cname=self.cname, c_itemsize=c.itemsize
+ )
+ )
return c.itemsize
return None
@@ -1731,8 +1867,8 @@ def validate_attr(self, append):
if existing_kind is not None and existing_kind != self.kind:
raise TypeError(
"incompatible kind in col [{existing} - "
- "{self_kind}]".format(
- existing=existing_kind, self_kind=self.kind))
+ "{self_kind}]".format(existing=existing_kind, self_kind=self.kind)
+ )
def update_info(self, info):
""" set/update the info for this indexable with the key/value
@@ -1747,7 +1883,7 @@ def update_info(self, info):
if key in idx and value is not None and existing_value != value:
# frequency/name just warn
- if key in ['freq', 'index_name']:
+ if key in ["freq", "index_name"]:
ws = attribute_conflict_doc % (key, existing_value, value)
warnings.warn(ws, AttributeConflictWarning, stacklevel=6)
@@ -1760,8 +1896,12 @@ def update_info(self, info):
"invalid info for [{name}] for [{key}], "
"existing_value [{existing_value}] conflicts with "
"new value [{value}]".format(
- name=self.name, key=key,
- existing_value=existing_value, value=value))
+ name=self.name,
+ key=key,
+ existing_value=existing_value,
+ value=value,
+ )
+ )
else:
if value is not None or existing_value is not None:
idx[key] = value
@@ -1788,13 +1928,18 @@ def read_metadata(self, handler):
def validate_metadata(self, handler):
""" validate that kind=category does not change the categories """
- if self.meta == 'category':
+ if self.meta == "category":
new_metadata = self.metadata
cur_metadata = handler.read_metadata(self.cname)
- if (new_metadata is not None and cur_metadata is not None and
- not array_equivalent(new_metadata, cur_metadata)):
- raise ValueError("cannot append a categorical with "
- "different categories to the existing")
+ if (
+ new_metadata is not None
+ and cur_metadata is not None
+ and not array_equivalent(new_metadata, cur_metadata)
+ ):
+ raise ValueError(
+ "cannot append a categorical with "
+ "different categories to the existing"
+ )
def write_metadata(self, handler):
""" set the meta data """
@@ -1810,8 +1955,7 @@ class GenericIndexCol(IndexCol):
def is_indexed(self):
return False
- def convert(self, values, nan_rep, encoding, errors, start=None,
- stop=None):
+ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None):
""" set the values from this selection: take = take ownership
Parameters
@@ -1829,8 +1973,7 @@ def convert(self, values, nan_rep, encoding, errors, start=None,
"""
start = start if start is not None else 0
- stop = (min(stop, self.table.nrows)
- if stop is not None else self.table.nrows)
+ stop = min(stop, self.table.nrows) if stop is not None else self.table.nrows
self.values = Int64Index(np.arange(stop - start))
return self
@@ -1855,17 +1998,17 @@ class DataCol(IndexCol):
meta : a string description of the metadata
metadata : the actual metadata
"""
+
is_an_indexable = False
is_data_indexable = False
- _info_fields = ['tz', 'ordered']
+ _info_fields = ["tz", "ordered"]
@classmethod
- def create_for_block(
- cls, i=None, name=None, cname=None, version=None, **kwargs):
+ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs):
""" return a new datacol with the block i """
if cname is None:
- cname = name or 'values_block_{idx}'.format(idx=i)
+ cname = name or "values_block_{idx}".format(idx=i)
if name is None:
name = cname
@@ -1881,34 +2024,45 @@ def create_for_block(
return cls(name=name, cname=cname, **kwargs)
- def __init__(self, values=None, kind=None, typ=None,
- cname=None, data=None, meta=None, metadata=None,
- block=None, **kwargs):
- super().__init__(values=values, kind=kind, typ=typ, cname=cname,
- **kwargs)
+ def __init__(
+ self,
+ values=None,
+ kind=None,
+ typ=None,
+ cname=None,
+ data=None,
+ meta=None,
+ metadata=None,
+ block=None,
+ **kwargs
+ ):
+ super().__init__(values=values, kind=kind, typ=typ, cname=cname, **kwargs)
self.dtype = None
- self.dtype_attr = '{name}_dtype'.format(name=self.name)
+ self.dtype_attr = "{name}_dtype".format(name=self.name)
self.meta = meta
- self.meta_attr = '{name}_meta'.format(name=self.name)
+ self.meta_attr = "{name}_meta".format(name=self.name)
self.set_data(data)
self.set_metadata(metadata)
def __repr__(self):
temp = tuple(
- map(pprint_thing,
- (self.name,
- self.cname,
- self.dtype,
- self.kind,
- self.shape)))
- return ','.join(("{key}->{value}".format(key=key, value=value)
- for key, value in zip(
- ['name', 'cname', 'dtype', 'kind', 'shape'], temp)))
+ map(
+ pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
+ )
+ )
+ return ",".join(
+ (
+ "{key}->{value}".format(key=key, value=value)
+ for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
+ )
+ )
def __eq__(self, other):
""" compare 2 col items """
- return all(getattr(self, a, None) == getattr(other, a, None)
- for a in ['name', 'cname', 'dtype', 'pos'])
+ return all(
+ getattr(self, a, None) == getattr(other, a, None)
+ for a in ["name", "cname", "dtype", "pos"]
+ )
def set_data(self, data, dtype=None):
self.data = data
@@ -1937,39 +2091,49 @@ def set_kind(self):
if self.dtype is not None:
dtype = _ensure_decoded(self.dtype)
- if dtype.startswith('string') or dtype.startswith('bytes'):
- self.kind = 'string'
- elif dtype.startswith('float'):
- self.kind = 'float'
- elif dtype.startswith('complex'):
- self.kind = 'complex'
- elif dtype.startswith('int') or dtype.startswith('uint'):
- self.kind = 'integer'
- elif dtype.startswith('date'):
- self.kind = 'datetime'
- elif dtype.startswith('timedelta'):
- self.kind = 'timedelta'
- elif dtype.startswith('bool'):
- self.kind = 'bool'
+ if dtype.startswith("string") or dtype.startswith("bytes"):
+ self.kind = "string"
+ elif dtype.startswith("float"):
+ self.kind = "float"
+ elif dtype.startswith("complex"):
+ self.kind = "complex"
+ elif dtype.startswith("int") or dtype.startswith("uint"):
+ self.kind = "integer"
+ elif dtype.startswith("date"):
+ self.kind = "datetime"
+ elif dtype.startswith("timedelta"):
+ self.kind = "timedelta"
+ elif dtype.startswith("bool"):
+ self.kind = "bool"
else:
raise AssertionError(
"cannot interpret dtype of [{dtype}] in [{obj}]".format(
- dtype=dtype, obj=self))
+ dtype=dtype, obj=self
+ )
+ )
# set my typ if we need
if self.typ is None:
self.typ = getattr(self.description, self.cname, None)
- def set_atom(self, block, block_items, existing_col, min_itemsize,
- nan_rep, info, encoding=None, errors='strict'):
+ def set_atom(
+ self,
+ block,
+ block_items,
+ existing_col,
+ min_itemsize,
+ nan_rep,
+ info,
+ encoding=None,
+ errors="strict",
+ ):
""" create and setup my atom from the block b """
self.values = list(block_items)
# short-cut certain block types
if block.is_categorical:
- return self.set_atom_categorical(block, items=block_items,
- info=info)
+ return self.set_atom_categorical(block, items=block_items, info=info)
elif block.is_datetimetz:
return self.set_atom_datetime64tz(block, info=info)
elif block.is_datetime:
@@ -1982,32 +2146,31 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
dtype = block.dtype.name
inferred_type = lib.infer_dtype(block.values, skipna=False)
- if inferred_type == 'date':
- raise TypeError(
- "[date] is not implemented as a table column")
- elif inferred_type == 'datetime':
+ if inferred_type == "date":
+ raise TypeError("[date] is not implemented as a table column")
+ elif inferred_type == "datetime":
# after 8260
# this would only be hit for a multi-timezone dtype
# which is an error
raise TypeError(
- "too many timezones in this block, create separate "
- "data columns"
+ "too many timezones in this block, create separate " "data columns"
)
- elif inferred_type == 'unicode':
- raise TypeError(
- "[unicode] is not implemented as a table column")
+ elif inferred_type == "unicode":
+ raise TypeError("[unicode] is not implemented as a table column")
# this is basically a catchall; if say a datetime64 has nans then will
# end up here ###
- elif inferred_type == 'string' or dtype == 'object':
+ elif inferred_type == "string" or dtype == "object":
self.set_atom_string(
- block, block_items,
+ block,
+ block_items,
existing_col,
min_itemsize,
nan_rep,
encoding,
- errors)
+ errors,
+ )
# set as a data block
else:
@@ -2016,8 +2179,9 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
def get_atom_string(self, block, itemsize):
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])
- def set_atom_string(self, block, block_items, existing_col, min_itemsize,
- nan_rep, encoding, errors):
+ def set_atom_string(
+ self, block, block_items, existing_col, min_itemsize, nan_rep, encoding, errors
+ ):
# fill nan items with myself, don't disturb the blocks by
# trying to downcast
block = block.fillna(nan_rep, downcast=False)
@@ -2027,7 +2191,7 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
# see if we have a valid string type
inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
- if inferred_type != 'string':
+ if inferred_type != "string":
# we cannot serialize this data, so report an exception on a column
# by column basis
@@ -2035,11 +2199,12 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
col = block.iget(i)
inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
- if inferred_type != 'string':
+ if inferred_type != "string":
raise TypeError(
"Cannot serialize the column [{item}] because\n"
"its data contents are [{type}] object dtype".format(
- item=item, type=inferred_type)
+ item=item, type=inferred_type
+ )
)
# itemsize is the maximum length of a string (along any dimension)
@@ -2048,8 +2213,9 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
# specified min_itemsize?
if isinstance(min_itemsize, dict):
- min_itemsize = int(min_itemsize.get(
- self.name) or min_itemsize.get('values') or 0)
+ min_itemsize = int(
+ min_itemsize.get(self.name) or min_itemsize.get("values") or 0
+ )
itemsize = max(min_itemsize or 0, itemsize)
# check for column in the values conflicts
@@ -2059,16 +2225,17 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
itemsize = eci
self.itemsize = itemsize
- self.kind = 'string'
+ self.kind = "string"
self.typ = self.get_atom_string(block, itemsize)
- self.set_data(data_converted.astype(
- '|S{size}'.format(size=itemsize), copy=False))
+ self.set_data(
+ data_converted.astype("|S{size}".format(size=itemsize), copy=False)
+ )
def get_atom_coltype(self, kind=None):
""" return the PyTables column class for this column """
if kind is None:
kind = self.kind
- if self.kind.startswith('uint'):
+ if self.kind.startswith("uint"):
col_name = "UInt{name}Col".format(name=kind[4:])
else:
col_name = "{name}Col".format(name=kind.capitalize())
@@ -2080,9 +2247,8 @@ def get_atom_data(self, block, kind=None):
def set_atom_complex(self, block):
self.kind = block.dtype.name
- itemsize = int(self.kind.split('complex')[-1]) // 8
- self.typ = _tables().ComplexCol(
- itemsize=itemsize, shape=block.shape[0])
+ itemsize = int(self.kind.split("complex")[-1]) // 8
+ self.typ = _tables().ComplexCol(itemsize=itemsize, shape=block.shape[0])
self.set_data(block.values.astype(self.typ.type, copy=False))
def set_atom_data(self, block):
@@ -2096,7 +2262,7 @@ def set_atom_categorical(self, block, items, info=None, values=None):
values = block.values
codes = values.codes
- self.kind = 'integer'
+ self.kind = "integer"
self.dtype = codes.dtype.name
if values.ndim > 1:
raise NotImplementedError("only support 1-d categoricals")
@@ -2109,7 +2275,7 @@ def set_atom_categorical(self, block, items, info=None, values=None):
self.set_data(_block_shape(codes))
# write the categories
- self.meta = 'category'
+ self.meta = "category"
self.set_metadata(block.values.categories)
# update the info
@@ -2119,11 +2285,11 @@ def get_atom_datetime64(self, block):
return _tables().Int64Col(shape=block.shape[0])
def set_atom_datetime64(self, block, values=None):
- self.kind = 'datetime64'
+ self.kind = "datetime64"
self.typ = self.get_atom_datetime64(block)
if values is None:
- values = block.values.view('i8')
- self.set_data(values, 'datetime64')
+ values = block.values.view("i8")
+ self.set_data(values, "datetime64")
def set_atom_datetime64tz(self, block, info, values=None):
@@ -2137,23 +2303,23 @@ def set_atom_datetime64tz(self, block, info, values=None):
self.tz = _get_tz(block.values.tz)
self.update_info(info)
- self.kind = 'datetime64'
+ self.kind = "datetime64"
self.typ = self.get_atom_datetime64(block)
- self.set_data(values, 'datetime64')
+ self.set_data(values, "datetime64")
def get_atom_timedelta64(self, block):
return _tables().Int64Col(shape=block.shape[0])
def set_atom_timedelta64(self, block, values=None):
- self.kind = 'timedelta64'
+ self.kind = "timedelta64"
self.typ = self.get_atom_timedelta64(block)
if values is None:
- values = block.values.view('i8')
- self.set_data(values, 'timedelta64')
+ values = block.values.view("i8")
+ self.set_data(values, "timedelta64")
@property
def shape(self):
- return getattr(self.data, 'shape', None)
+ return getattr(self.data, "shape", None)
@property
def cvalues(self):
@@ -2164,19 +2330,19 @@ def validate_attr(self, append):
"""validate that we have the same order as the existing & same dtype"""
if append:
existing_fields = getattr(self.attrs, self.kind_attr, None)
- if (existing_fields is not None and
- existing_fields != list(self.values)):
- raise ValueError("appended items do not match existing items"
- " in table!")
+ if existing_fields is not None and existing_fields != list(self.values):
+ raise ValueError(
+ "appended items do not match existing items" " in table!"
+ )
existing_dtype = getattr(self.attrs, self.dtype_attr, None)
- if (existing_dtype is not None and
- existing_dtype != self.dtype):
- raise ValueError("appended items dtype do not match existing "
- "items dtype in table!")
+ if existing_dtype is not None and existing_dtype != self.dtype:
+ raise ValueError(
+ "appended items dtype do not match existing "
+ "items dtype in table!"
+ )
- def convert(self, values, nan_rep, encoding, errors, start=None,
- stop=None):
+ def convert(self, values, nan_rep, encoding, errors, start=None, stop=None):
"""set the data from this selection (and convert to the correct dtype
if we can)
"""
@@ -2195,27 +2361,28 @@ def convert(self, values, nan_rep, encoding, errors, start=None,
dtype = _ensure_decoded(self.dtype)
# reverse converts
- if dtype == 'datetime64':
+ if dtype == "datetime64":
# recreate with tz if indicated
self.data = _set_tz(self.data, self.tz, coerce=True)
- elif dtype == 'timedelta64':
- self.data = np.asarray(self.data, dtype='m8[ns]')
- elif dtype == 'date':
+ elif dtype == "timedelta64":
+ self.data = np.asarray(self.data, dtype="m8[ns]")
+ elif dtype == "date":
try:
self.data = np.asarray(
- [date.fromordinal(v) for v in self.data], dtype=object)
+ [date.fromordinal(v) for v in self.data], dtype=object
+ )
except ValueError:
self.data = np.asarray(
- [date.fromtimestamp(v) for v in self.data],
- dtype=object)
- elif dtype == 'datetime':
+ [date.fromtimestamp(v) for v in self.data], dtype=object
+ )
+ elif dtype == "datetime":
self.data = np.asarray(
- [datetime.fromtimestamp(v) for v in self.data],
- dtype=object)
+ [datetime.fromtimestamp(v) for v in self.data], dtype=object
+ )
- elif meta == 'category':
+ elif meta == "category":
# we have a categorical
categories = self.metadata
@@ -2236,21 +2403,22 @@ def convert(self, values, nan_rep, encoding, errors, start=None,
categories = categories[~mask]
codes[codes != -1] -= mask.astype(int).cumsum().values
- self.data = Categorical.from_codes(codes,
- categories=categories,
- ordered=self.ordered)
+ self.data = Categorical.from_codes(
+ codes, categories=categories, ordered=self.ordered
+ )
else:
try:
self.data = self.data.astype(dtype, copy=False)
except TypeError:
- self.data = self.data.astype('O', copy=False)
+ self.data = self.data.astype("O", copy=False)
# convert nans / decode
- if _ensure_decoded(self.kind) == 'string':
+ if _ensure_decoded(self.kind) == "string":
self.data = _unconvert_string_array(
- self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)
+ self.data, nan_rep=nan_rep, encoding=encoding, errors=errors
+ )
return self
@@ -2272,6 +2440,7 @@ def set_attr(self):
class DataIndexableCol(DataCol):
""" represent a data column that can be indexed """
+
is_data_indexable = True
def validate_names(self):
@@ -2311,13 +2480,13 @@ class Fixed:
parent : my parent HDFStore
group : the group node where the table resides
"""
+
pandas_kind = None # type: str
obj_type = None # type: Type[Union[DataFrame, Series]]
ndim = None # type: int
is_table = False
- def __init__(self, parent, group, encoding=None, errors='strict',
- **kwargs):
+ def __init__(self, parent, group, encoding=None, errors="strict", **kwargs):
self.parent = parent
self.group = group
self.encoding = _ensure_encoding(encoding)
@@ -2326,15 +2495,13 @@ def __init__(self, parent, group, encoding=None, errors='strict',
@property
def is_old_version(self):
- return (self.version[0] <= 0 and self.version[1] <= 10 and
- self.version[2] < 1)
+ return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
def set_version(self):
""" compute and set our version """
- version = _ensure_decoded(
- getattr(self.group._v_attrs, 'pandas_version', None))
+ version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
try:
- self.version = tuple(int(x) for x in version.split('.'))
+ self.version = tuple(int(x) for x in version.split("."))
if len(self.version) == 2:
self.version = self.version + (0,)
except AttributeError:
@@ -2342,12 +2509,11 @@ def set_version(self):
@property
def pandas_type(self):
- return _ensure_decoded(getattr(self.group._v_attrs,
- 'pandas_type', None))
+ return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
@property
def format_type(self):
- return 'fixed'
+ return "fixed"
def __repr__(self):
""" return a pretty representation of myself """
@@ -2355,10 +2521,10 @@ def __repr__(self):
s = self.shape
if s is not None:
if isinstance(s, (list, tuple)):
- s = "[{shape}]".format(
- shape=','.join(pprint_thing(x) for x in s))
+ s = "[{shape}]".format(shape=",".join(pprint_thing(x) for x in s))
return "{type:12.12} (shape->{shape})".format(
- type=self.pandas_type, shape=s)
+ type=self.pandas_type, shape=s
+ )
return self.pandas_type
def set_object_info(self):
@@ -2426,7 +2592,7 @@ def is_exists(self):
@property
def nrows(self):
- return getattr(self.storable, 'nrows', None)
+ return getattr(self.storable, "nrows", None)
def validate(self, other):
""" validate against an existing storable """
@@ -2450,11 +2616,13 @@ def infer_axes(self):
def read(self, **kwargs):
raise NotImplementedError(
- "cannot read on an abstract storer: subclasses should implement")
+ "cannot read on an abstract storer: subclasses should implement"
+ )
def write(self, **kwargs):
raise NotImplementedError(
- "cannot write on an abstract storer: sublcasses should implement")
+ "cannot write on an abstract storer: sublcasses should implement"
+ )
def delete(self, where=None, start=None, stop=None, **kwargs):
"""
@@ -2471,13 +2639,14 @@ def delete(self, where=None, start=None, stop=None, **kwargs):
class GenericFixed(Fixed):
""" a generified fixed version """
- _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'}
+
+ _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
_reverse_index_map = {v: k for k, v in _index_type_map.items()}
attributes = [] # type: List[str]
# indexer helpers
def _class_to_alias(self, cls):
- return self._index_type_map.get(cls, '')
+ return self._index_type_map.get(cls, "")
def _alias_to_class(self, alias):
if isinstance(alias, type): # pragma: no cover
@@ -2487,17 +2656,20 @@ def _alias_to_class(self, alias):
def _get_index_factory(self, klass):
if klass == DatetimeIndex:
+
def f(values, freq=None, tz=None):
# data are already in UTC, localize and convert if tz present
- result = DatetimeIndex._simple_new(values.values, name=None,
- freq=freq)
+ result = DatetimeIndex._simple_new(values.values, name=None, freq=freq)
if tz is not None:
- result = result.tz_localize('UTC').tz_convert(tz)
+ result = result.tz_localize("UTC").tz_convert(tz)
return result
+
return f
elif klass == PeriodIndex:
+
def f(values, freq=None, tz=None):
return PeriodIndex._simple_new(values, name=None, freq=freq)
+
return f
return klass
@@ -2509,16 +2681,20 @@ def validate_read(self, kwargs):
"""
kwargs = copy.copy(kwargs)
- columns = kwargs.pop('columns', None)
+ columns = kwargs.pop("columns", None)
if columns is not None:
- raise TypeError("cannot pass a column specification when reading "
- "a Fixed format store. this store must be "
- "selected in its entirety")
- where = kwargs.pop('where', None)
+ raise TypeError(
+ "cannot pass a column specification when reading "
+ "a Fixed format store. this store must be "
+ "selected in its entirety"
+ )
+ where = kwargs.pop("where", None)
if where is not None:
- raise TypeError("cannot pass a where specification when reading "
- "from a Fixed format store. this store must be "
- "selected in its entirety")
+ raise TypeError(
+ "cannot pass a where specification when reading "
+ "from a Fixed format store. this store must be "
+ "selected in its entirety"
+ )
return kwargs
@property
@@ -2532,8 +2708,8 @@ def set_attrs(self):
def get_attrs(self):
""" retrieve our attributes """
- self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None))
- self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
+ self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
+ self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
for n in self.attributes:
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
@@ -2543,16 +2719,17 @@ def write(self, obj, **kwargs):
def read_array(self, key, start=None, stop=None):
""" read an array for the specified node (off of group """
import tables
+
node = getattr(self.group, key)
attrs = node._v_attrs
- transposed = getattr(attrs, 'transposed', False)
+ transposed = getattr(attrs, "transposed", False)
if isinstance(node, tables.VLArray):
ret = node[0][start:stop]
else:
- dtype = getattr(attrs, 'value_type', None)
- shape = getattr(attrs, 'shape', None)
+ dtype = getattr(attrs, "value_type", None)
+ shape = getattr(attrs, "shape", None)
if shape is not None:
# length 0 axis
@@ -2560,13 +2737,13 @@ def read_array(self, key, start=None, stop=None):
else:
ret = node[start:stop]
- if dtype == 'datetime64':
+ if dtype == "datetime64":
# reconstruct a timezone if indicated
- ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)
+ ret = _set_tz(ret, getattr(attrs, "tz", None), coerce=True)
- elif dtype == 'timedelta64':
- ret = np.asarray(ret, dtype='m8[ns]')
+ elif dtype == "timedelta64":
+ ret = np.asarray(ret, dtype="m8[ns]")
if transposed:
return ret.T
@@ -2574,37 +2751,37 @@ def read_array(self, key, start=None, stop=None):
return ret
def read_index(self, key, **kwargs):
- variety = _ensure_decoded(
- getattr(self.attrs, '{key}_variety'.format(key=key)))
+ variety = _ensure_decoded(getattr(self.attrs, "{key}_variety".format(key=key)))
- if variety == 'multi':
+ if variety == "multi":
return self.read_multi_index(key, **kwargs)
- elif variety == 'block':
+ elif variety == "block":
return self.read_block_index(key, **kwargs)
- elif variety == 'sparseint':
+ elif variety == "sparseint":
return self.read_sparse_intindex(key, **kwargs)
- elif variety == 'regular':
+ elif variety == "regular":
_, index = self.read_index_node(getattr(self.group, key), **kwargs)
return index
else: # pragma: no cover
raise TypeError(
- 'unrecognized index variety: {variety}'.format(
- variety=variety))
+ "unrecognized index variety: {variety}".format(variety=variety)
+ )
def write_index(self, key, index):
if isinstance(index, MultiIndex):
- setattr(self.attrs, '{key}_variety'.format(key=key), 'multi')
+ setattr(self.attrs, "{key}_variety".format(key=key), "multi")
self.write_multi_index(key, index)
elif isinstance(index, BlockIndex):
- setattr(self.attrs, '{key}_variety'.format(key=key), 'block')
+ setattr(self.attrs, "{key}_variety".format(key=key), "block")
self.write_block_index(key, index)
elif isinstance(index, IntIndex):
- setattr(self.attrs, '{key}_variety'.format(key=key), 'sparseint')
+ setattr(self.attrs, "{key}_variety".format(key=key), "sparseint")
self.write_sparse_intindex(key, index)
else:
- setattr(self.attrs, '{key}_variety'.format(key=key), 'regular')
- converted = _convert_index(index, self.encoding, self.errors,
- self.format_type).set_name('index')
+ setattr(self.attrs, "{key}_variety".format(key=key), "regular")
+ converted = _convert_index(
+ index, self.encoding, self.errors, self.format_type
+ ).set_name("index")
self.write_array(key, converted.values)
@@ -2615,113 +2792,124 @@ def write_index(self, key, index):
if isinstance(index, (DatetimeIndex, PeriodIndex)):
node._v_attrs.index_class = self._class_to_alias(type(index))
- if hasattr(index, 'freq'):
+ if hasattr(index, "freq"):
node._v_attrs.freq = index.freq
- if hasattr(index, 'tz') and index.tz is not None:
+ if hasattr(index, "tz") and index.tz is not None:
node._v_attrs.tz = _get_tz(index.tz)
def write_block_index(self, key, index):
- self.write_array('{key}_blocs'.format(key=key), index.blocs)
- self.write_array('{key}_blengths'.format(key=key), index.blengths)
- setattr(self.attrs, '{key}_length'.format(key=key), index.length)
+ self.write_array("{key}_blocs".format(key=key), index.blocs)
+ self.write_array("{key}_blengths".format(key=key), index.blengths)
+ setattr(self.attrs, "{key}_length".format(key=key), index.length)
def read_block_index(self, key, **kwargs):
- length = getattr(self.attrs, '{key}_length'.format(key=key))
- blocs = self.read_array('{key}_blocs'.format(key=key), **kwargs)
- blengths = self.read_array('{key}_blengths'.format(key=key), **kwargs)
+ length = getattr(self.attrs, "{key}_length".format(key=key))
+ blocs = self.read_array("{key}_blocs".format(key=key), **kwargs)
+ blengths = self.read_array("{key}_blengths".format(key=key), **kwargs)
return BlockIndex(length, blocs, blengths)
def write_sparse_intindex(self, key, index):
- self.write_array('{key}_indices'.format(key=key), index.indices)
- setattr(self.attrs, '{key}_length'.format(key=key), index.length)
+ self.write_array("{key}_indices".format(key=key), index.indices)
+ setattr(self.attrs, "{key}_length".format(key=key), index.length)
def read_sparse_intindex(self, key, **kwargs):
- length = getattr(self.attrs, '{key}_length'.format(key=key))
- indices = self.read_array('{key}_indices'.format(key=key), **kwargs)
+ length = getattr(self.attrs, "{key}_length".format(key=key))
+ indices = self.read_array("{key}_indices".format(key=key), **kwargs)
return IntIndex(length, indices)
def write_multi_index(self, key, index):
- setattr(self.attrs, '{key}_nlevels'.format(key=key), index.nlevels)
+ setattr(self.attrs, "{key}_nlevels".format(key=key), index.nlevels)
- for i, (lev, level_codes, name) in enumerate(zip(index.levels,
- index.codes,
- index.names)):
+ for i, (lev, level_codes, name) in enumerate(
+ zip(index.levels, index.codes, index.names)
+ ):
# write the level
if is_extension_type(lev):
- raise NotImplementedError("Saving a MultiIndex with an "
- "extension dtype is not supported.")
- level_key = '{key}_level{idx}'.format(key=key, idx=i)
- conv_level = _convert_index(lev, self.encoding, self.errors,
- self.format_type).set_name(level_key)
+ raise NotImplementedError(
+ "Saving a MultiIndex with an " "extension dtype is not supported."
+ )
+ level_key = "{key}_level{idx}".format(key=key, idx=i)
+ conv_level = _convert_index(
+ lev, self.encoding, self.errors, self.format_type
+ ).set_name(level_key)
self.write_array(level_key, conv_level.values)
node = getattr(self.group, level_key)
node._v_attrs.kind = conv_level.kind
node._v_attrs.name = name
# write the name
- setattr(node._v_attrs, '{key}_name{name}'.format(
- key=key, name=name), name)
+ setattr(node._v_attrs, "{key}_name{name}".format(key=key, name=name), name)
# write the labels
- label_key = '{key}_label{idx}'.format(key=key, idx=i)
+ label_key = "{key}_label{idx}".format(key=key, idx=i)
self.write_array(label_key, level_codes)
def read_multi_index(self, key, **kwargs):
- nlevels = getattr(self.attrs, '{key}_nlevels'.format(key=key))
+ nlevels = getattr(self.attrs, "{key}_nlevels".format(key=key))
levels = []
codes = []
names = []
for i in range(nlevels):
- level_key = '{key}_level{idx}'.format(key=key, idx=i)
- name, lev = self.read_index_node(getattr(self.group, level_key),
- **kwargs)
+ level_key = "{key}_level{idx}".format(key=key, idx=i)
+ name, lev = self.read_index_node(getattr(self.group, level_key), **kwargs)
levels.append(lev)
names.append(name)
- label_key = '{key}_label{idx}'.format(key=key, idx=i)
+ label_key = "{key}_label{idx}".format(key=key, idx=i)
level_codes = self.read_array(label_key, **kwargs)
codes.append(level_codes)
- return MultiIndex(levels=levels, codes=codes, names=names,
- verify_integrity=True)
+ return MultiIndex(
+ levels=levels, codes=codes, names=names, verify_integrity=True
+ )
def read_index_node(self, node, start=None, stop=None):
data = node[start:stop]
# If the index was an empty array, write_array_empty() will
# have written a sentinel. Here we replace it with the original.
- if ('shape' in node._v_attrs and
- self._is_empty_array(getattr(node._v_attrs, 'shape'))):
- data = np.empty(getattr(node._v_attrs, 'shape'),
- dtype=getattr(node._v_attrs, 'value_type'))
+ if "shape" in node._v_attrs and self._is_empty_array(
+ getattr(node._v_attrs, "shape")
+ ):
+ data = np.empty(
+ getattr(node._v_attrs, "shape"),
+ dtype=getattr(node._v_attrs, "value_type"),
+ )
kind = _ensure_decoded(node._v_attrs.kind)
name = None
- if 'name' in node._v_attrs:
+ if "name" in node._v_attrs:
name = _ensure_str(node._v_attrs.name)
name = _ensure_decoded(name)
- index_class = self._alias_to_class(_ensure_decoded(
- getattr(node._v_attrs, 'index_class', '')))
+ index_class = self._alias_to_class(
+ _ensure_decoded(getattr(node._v_attrs, "index_class", ""))
+ )
factory = self._get_index_factory(index_class)
kwargs = {}
- if 'freq' in node._v_attrs:
- kwargs['freq'] = node._v_attrs['freq']
-
- if 'tz' in node._v_attrs:
- kwargs['tz'] = node._v_attrs['tz']
-
- if kind in ('date', 'datetime'):
- index = factory(_unconvert_index(data, kind,
- encoding=self.encoding,
- errors=self.errors),
- dtype=object, **kwargs)
+ if "freq" in node._v_attrs:
+ kwargs["freq"] = node._v_attrs["freq"]
+
+ if "tz" in node._v_attrs:
+ kwargs["tz"] = node._v_attrs["tz"]
+
+ if kind in ("date", "datetime"):
+ index = factory(
+ _unconvert_index(
+ data, kind, encoding=self.encoding, errors=self.errors
+ ),
+ dtype=object,
+ **kwargs
+ )
else:
- index = factory(_unconvert_index(data, kind,
- encoding=self.encoding,
- errors=self.errors), **kwargs)
+ index = factory(
+ _unconvert_index(
+ data, kind, encoding=self.encoding, errors=self.errors
+ ),
+ **kwargs
+ )
index.name = name
@@ -2749,11 +2937,13 @@ def write_array(self, key, value, items=None):
transposed = False
if is_categorical_dtype(value):
- raise NotImplementedError('Cannot store a category dtype in '
- 'a HDF5 dataset that uses format='
- '"fixed". Use format="table".')
+ raise NotImplementedError(
+ "Cannot store a category dtype in "
+ "a HDF5 dataset that uses format="
+ '"fixed". Use format="table".'
+ )
if not empty_array:
- if hasattr(value, 'T'):
+ if hasattr(value, "T"):
# ExtensionArrays (1d) may not have transpose.
value = value.T
transposed = True
@@ -2769,9 +2959,9 @@ def write_array(self, key, value, items=None):
if atom is not None:
# create an empty chunked array and fill it from value
if not empty_array:
- ca = self._handle.create_carray(self.group, key, atom,
- value.shape,
- filters=self._filters)
+ ca = self._handle.create_carray(
+ self.group, key, atom, value.shape, filters=self._filters
+ )
ca[:] = value
getattr(self.group, key)._v_attrs.transposed = transposed
@@ -2787,7 +2977,7 @@ def write_array(self, key, value, items=None):
inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
if empty_array:
pass
- elif inferred_type == 'string':
+ elif inferred_type == "string":
pass
else:
try:
@@ -2797,32 +2987,26 @@ def write_array(self, key, value, items=None):
ws = performance_doc % (inferred_type, key, items)
warnings.warn(ws, PerformanceWarning, stacklevel=7)
- vlarr = self._handle.create_vlarray(self.group, key,
- _tables().ObjectAtom())
+ vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
vlarr.append(value)
else:
if empty_array:
self.write_array_empty(key, value)
else:
if is_datetime64_dtype(value.dtype):
- self._handle.create_array(
- self.group, key, value.view('i8'))
- getattr(
- self.group, key)._v_attrs.value_type = 'datetime64'
+ self._handle.create_array(self.group, key, value.view("i8"))
+ getattr(self.group, key)._v_attrs.value_type = "datetime64"
elif is_datetime64tz_dtype(value.dtype):
# store as UTC
# with a zone
- self._handle.create_array(self.group, key,
- value.asi8)
+ self._handle.create_array(self.group, key, value.asi8)
node = getattr(self.group, key)
node._v_attrs.tz = _get_tz(value.tz)
- node._v_attrs.value_type = 'datetime64'
+ node._v_attrs.value_type = "datetime64"
elif is_timedelta64_dtype(value.dtype):
- self._handle.create_array(
- self.group, key, value.view('i8'))
- getattr(
- self.group, key)._v_attrs.value_type = 'timedelta64'
+ self._handle.create_array(self.group, key, value.view("i8"))
+ getattr(self.group, key)._v_attrs.value_type = "timedelta64"
else:
self._handle.create_array(self.group, key, value)
@@ -2830,117 +3014,122 @@ def write_array(self, key, value, items=None):
class LegacyFixed(GenericFixed):
-
def read_index_legacy(self, key, start=None, stop=None):
node = getattr(self.group, key)
data = node[start:stop]
kind = node._v_attrs.kind
- return _unconvert_index_legacy(data, kind, encoding=self.encoding,
- errors=self.errors)
+ return _unconvert_index_legacy(
+ data, kind, encoding=self.encoding, errors=self.errors
+ )
class LegacySeriesFixed(LegacyFixed):
-
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
- index = self.read_index_legacy('index')
- values = self.read_array('values')
+ index = self.read_index_legacy("index")
+ values = self.read_array("values")
return Series(values, index=index)
class LegacyFrameFixed(LegacyFixed):
-
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
- index = self.read_index_legacy('index')
- columns = self.read_index_legacy('columns')
- values = self.read_array('values')
+ index = self.read_index_legacy("index")
+ columns = self.read_index_legacy("columns")
+ values = self.read_array("values")
return DataFrame(values, index=index, columns=columns)
class SeriesFixed(GenericFixed):
- pandas_kind = 'series'
- attributes = ['name']
+ pandas_kind = "series"
+ attributes = ["name"]
@property
def shape(self):
try:
- return len(getattr(self.group, 'values')),
+ return (len(getattr(self.group, "values")),)
except (TypeError, AttributeError):
return None
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
- index = self.read_index('index', **kwargs)
- values = self.read_array('values', **kwargs)
+ index = self.read_index("index", **kwargs)
+ values = self.read_array("values", **kwargs)
return Series(values, index=index, name=self.name)
def write(self, obj, **kwargs):
super().write(obj, **kwargs)
- self.write_index('index', obj.index)
- self.write_array('values', obj.values)
+ self.write_index("index", obj.index)
+ self.write_array("values", obj.values)
self.attrs.name = obj.name
class SparseFixed(GenericFixed):
-
def validate_read(self, kwargs):
"""
we don't support start, stop kwds in Sparse
"""
kwargs = super().validate_read(kwargs)
- if 'start' in kwargs or 'stop' in kwargs:
- raise NotImplementedError("start and/or stop are not supported "
- "in fixed Sparse reading")
+ if "start" in kwargs or "stop" in kwargs:
+ raise NotImplementedError(
+ "start and/or stop are not supported " "in fixed Sparse reading"
+ )
return kwargs
class SparseSeriesFixed(SparseFixed):
- pandas_kind = 'sparse_series'
- attributes = ['name', 'fill_value', 'kind']
+ pandas_kind = "sparse_series"
+ attributes = ["name", "fill_value", "kind"]
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
- index = self.read_index('index')
- sp_values = self.read_array('sp_values')
- sp_index = self.read_index('sp_index')
- return SparseSeries(sp_values, index=index, sparse_index=sp_index,
- kind=self.kind or 'block',
- fill_value=self.fill_value,
- name=self.name)
+ index = self.read_index("index")
+ sp_values = self.read_array("sp_values")
+ sp_index = self.read_index("sp_index")
+ return SparseSeries(
+ sp_values,
+ index=index,
+ sparse_index=sp_index,
+ kind=self.kind or "block",
+ fill_value=self.fill_value,
+ name=self.name,
+ )
def write(self, obj, **kwargs):
super().write(obj, **kwargs)
- self.write_index('index', obj.index)
- self.write_index('sp_index', obj.sp_index)
- self.write_array('sp_values', obj.sp_values)
+ self.write_index("index", obj.index)
+ self.write_index("sp_index", obj.sp_index)
+ self.write_array("sp_values", obj.sp_values)
self.attrs.name = obj.name
self.attrs.fill_value = obj.fill_value
self.attrs.kind = obj.kind
class SparseFrameFixed(SparseFixed):
- pandas_kind = 'sparse_frame'
- attributes = ['default_kind', 'default_fill_value']
+ pandas_kind = "sparse_frame"
+ attributes = ["default_kind", "default_fill_value"]
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
- columns = self.read_index('columns')
+ columns = self.read_index("columns")
sdict = {}
for c in columns:
- key = 'sparse_series_{columns}'.format(columns=c)
+ key = "sparse_series_{columns}".format(columns=c)
s = SparseSeriesFixed(self.parent, getattr(self.group, key))
s.infer_axes()
sdict[c] = s.read()
- return SparseDataFrame(sdict, columns=columns,
- default_kind=self.default_kind,
- default_fill_value=self.default_fill_value)
+ return SparseDataFrame(
+ sdict,
+ columns=columns,
+ default_kind=self.default_kind,
+ default_fill_value=self.default_fill_value,
+ )
def write(self, obj, **kwargs):
""" write it as a collection of individual sparse series """
super().write(obj, **kwargs)
for name, ss in obj.items():
- key = 'sparse_series_{name}'.format(name=name)
+ key = "sparse_series_{name}".format(name=name)
if key not in self.group._v_children:
node = self._handle.create_group(self.group, key)
else:
@@ -2949,11 +3138,11 @@ def write(self, obj, **kwargs):
s.write(ss)
self.attrs.default_fill_value = obj.default_fill_value
self.attrs.default_kind = obj.default_kind
- self.write_index('columns', obj.columns)
+ self.write_index("columns", obj.columns)
class BlockManagerFixed(GenericFixed):
- attributes = ['ndim', 'nblocks']
+ attributes = ["ndim", "nblocks"]
is_shape_reversed = False
@property
@@ -2964,16 +3153,16 @@ def shape(self):
# items
items = 0
for i in range(self.nblocks):
- node = getattr(self.group, 'block{idx}_items'.format(idx=i))
- shape = getattr(node, 'shape', None)
+ node = getattr(self.group, "block{idx}_items".format(idx=i))
+ shape = getattr(node, "shape", None)
if shape is not None:
items += shape[0]
# data shape
- node = getattr(self.group, 'block0_values')
- shape = getattr(node, 'shape', None)
+ node = getattr(self.group, "block0_values")
+ shape = getattr(node, "shape", None)
if shape is not None:
- shape = list(shape[0:(ndim - 1)])
+ shape = list(shape[0 : (ndim - 1)])
else:
shape = []
@@ -2997,19 +3186,18 @@ def read(self, start=None, stop=None, **kwargs):
for i in range(self.ndim):
_start, _stop = (start, stop) if i == select_axis else (None, None)
- ax = self.read_index('axis{idx}'.format(
- idx=i), start=_start, stop=_stop)
+ ax = self.read_index("axis{idx}".format(idx=i), start=_start, stop=_stop)
axes.append(ax)
items = axes[0]
blocks = []
for i in range(self.nblocks):
- blk_items = self.read_index('block{idx}_items'.format(idx=i))
- values = self.read_array('block{idx}_values'.format(idx=i),
- start=_start, stop=_stop)
- blk = make_block(values,
- placement=items.get_indexer(blk_items))
+ blk_items = self.read_index("block{idx}_items".format(idx=i))
+ values = self.read_array(
+ "block{idx}_values".format(idx=i), start=_start, stop=_stop
+ )
+ blk = make_block(values, placement=items.get_indexer(blk_items))
blocks.append(blk)
return self.obj_type(BlockManager(blocks, axes))
@@ -3024,22 +3212,22 @@ def write(self, obj, **kwargs):
for i, ax in enumerate(data.axes):
if i == 0:
if not ax.is_unique:
- raise ValueError(
- "Columns index has to be unique for fixed format")
- self.write_index('axis{idx}'.format(idx=i), ax)
+ raise ValueError("Columns index has to be unique for fixed format")
+ self.write_index("axis{idx}".format(idx=i), ax)
# Supporting mixed-type DataFrame objects...nontrivial
self.attrs.nblocks = len(data.blocks)
for i, blk in enumerate(data.blocks):
# I have no idea why, but writing values before items fixed #2299
blk_items = data.items.take(blk.mgr_locs)
- self.write_array('block{idx}_values'.format(idx=i),
- blk.values, items=blk_items)
- self.write_index('block{idx}_items'.format(idx=i), blk_items)
+ self.write_array(
+ "block{idx}_values".format(idx=i), blk.values, items=blk_items
+ )
+ self.write_index("block{idx}_items".format(idx=i), blk_items)
class FrameFixed(BlockManagerFixed):
- pandas_kind = 'frame'
+ pandas_kind = "frame"
obj_type = DataFrame
@@ -3068,7 +3256,8 @@ class Table(Fixed):
metadata : the names of the metadata columns
"""
- pandas_kind = 'wide_table'
+
+ pandas_kind = "wide_table"
table_type = None # type: str
levels = 1
is_table = True
@@ -3087,31 +3276,35 @@ def __init__(self, *args, **kwargs):
@property
def table_type_short(self):
- return self.table_type.split('_')[0]
+ return self.table_type.split("_")[0]
@property
def format_type(self):
- return 'table'
+ return "table"
def __repr__(self):
""" return a pretty representation of myself """
self.infer_axes()
- dc = ",dc->[{columns}]".format(columns=(','.join(
- self.data_columns) if len(self.data_columns) else ''))
+ dc = ",dc->[{columns}]".format(
+ columns=(",".join(self.data_columns) if len(self.data_columns) else "")
+ )
- ver = ''
+ ver = ""
if self.is_old_version:
- ver = "[{version}]".format(
- version='.'.join(str(x) for x in self.version))
+ ver = "[{version}]".format(version=".".join(str(x) for x in self.version))
return (
"{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows},"
"ncols->{ncols},indexers->[{index_axes}]{dc})".format(
- pandas_type=self.pandas_type, ver=ver,
- table_type=self.table_type_short, nrows=self.nrows,
+ pandas_type=self.pandas_type,
+ ver=ver,
+ table_type=self.table_type_short,
+ nrows=self.nrows,
ncols=self.ncols,
- index_axes=(','.join(a.name for a in self.index_axes)), dc=dc
- ))
+ index_axes=(",".join(a.name for a in self.index_axes)),
+ dc=dc,
+ )
+ )
def __getitem__(self, c):
""" return the axis for c """
@@ -3129,9 +3322,11 @@ def validate(self, other):
raise TypeError(
"incompatible table_type with existing "
"[{other} - {self}]".format(
- other=other.table_type, self=self.table_type))
+ other=other.table_type, self=self.table_type
+ )
+ )
- for c in ['index_axes', 'non_index_axes', 'values_axes']:
+ for c in ["index_axes", "non_index_axes", "values_axes"]:
sv = getattr(self, c, None)
ov = getattr(other, c, None)
if sv != ov:
@@ -3143,12 +3338,15 @@ def validate(self, other):
raise ValueError(
"invalid combinate of [{c}] on appending data "
"[{sax}] vs current table [{oax}]".format(
- c=c, sax=sax, oax=oax))
+ c=c, sax=sax, oax=oax
+ )
+ )
# should never get here
raise Exception(
"invalid combinate of [{c}] on appending data [{sv}] vs "
- "current table [{ov}]".format(c=c, sv=sv, ov=ov))
+ "current table [{ov}]".format(c=c, sv=sv, ov=ov)
+ )
@property
def is_multi_index(self):
@@ -3157,20 +3355,22 @@ def is_multi_index(self):
def validate_metadata(self, existing):
""" create / validate metadata """
- self.metadata = [
- c.name for c in self.values_axes if c.metadata is not None]
+ self.metadata = [c.name for c in self.values_axes if c.metadata is not None]
def validate_multiindex(self, obj):
"""validate that we can store the multi-index; reset and return the
new object
"""
- levels = [l if l is not None else "level_{0}".format(i)
- for i, l in enumerate(obj.index.names)]
+ levels = [
+ l if l is not None else "level_{0}".format(i)
+ for i, l in enumerate(obj.index.names)
+ ]
try:
return obj.reset_index(), levels
except ValueError:
- raise ValueError("duplicate names/columns in the multi-index when "
- "storing as a table")
+ raise ValueError(
+ "duplicate names/columns in the multi-index when " "storing as a table"
+ )
@property
def nrows_expected(self):
@@ -3180,11 +3380,11 @@ def nrows_expected(self):
@property
def is_exists(self):
""" has this table been created """
- return 'table' in self.group
+ return "table" in self.group
@property
def storable(self):
- return getattr(self.group, 'table', None)
+ return getattr(self.group, "table", None)
@property
def table(self):
@@ -3215,19 +3415,28 @@ def is_transposed(self):
@property
def data_orientation(self):
"""return a tuple of my permutated axes, non_indexable at the front"""
- return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes],
- [int(a.axis) for a in self.index_axes]))
+ return tuple(
+ itertools.chain(
+ [int(a[0]) for a in self.non_index_axes],
+ [int(a.axis) for a in self.index_axes],
+ )
+ )
def queryables(self):
""" return a dict of the kinds allowable columns for this object """
# compute the values_axes queryables
return dict(
- [(a.cname, a) for a in self.index_axes] +
- [(self.storage_obj_type._AXIS_NAMES[axis], None)
- for axis, values in self.non_index_axes] +
- [(v.cname, v) for v in self.values_axes
- if v.name in set(self.data_columns)]
+ [(a.cname, a) for a in self.index_axes]
+ + [
+ (self.storage_obj_type._AXIS_NAMES[axis], None)
+ for axis, values in self.non_index_axes
+ ]
+ + [
+ (v.cname, v)
+ for v in self.values_axes
+ if v.name in set(self.data_columns)
+ ]
)
def index_cols(self):
@@ -3240,8 +3449,7 @@ def values_cols(self):
def _get_metadata_path(self, key):
""" return the metadata pathname for this key """
- return "{group}/meta/{key}/meta".format(group=self.group._v_pathname,
- key=key)
+ return "{group}/meta/{key}/meta".format(group=self.group._v_pathname, key=key)
def write_metadata(self, key, values):
"""
@@ -3254,13 +3462,18 @@ def write_metadata(self, key, values):
"""
values = Series(values)
- self.parent.put(self._get_metadata_path(key), values, format='table',
- encoding=self.encoding, errors=self.errors,
- nan_rep=self.nan_rep)
+ self.parent.put(
+ self._get_metadata_path(key),
+ values,
+ format="table",
+ encoding=self.encoding,
+ errors=self.errors,
+ nan_rep=self.nan_rep,
+ )
def read_metadata(self, key):
""" return the meta data array for this key """
- if getattr(getattr(self.group, 'meta', None), key, None) is not None:
+ if getattr(getattr(self.group, "meta", None), key, None) is not None:
return self.parent.select(self._get_metadata_path(key))
return None
@@ -3284,34 +3497,24 @@ def set_attrs(self):
def get_attrs(self):
""" retrieve our attributes """
- self.non_index_axes = getattr(
- self.attrs, 'non_index_axes', None) or []
- self.data_columns = getattr(
- self.attrs, 'data_columns', None) or []
- self.info = getattr(
- self.attrs, 'info', None) or dict()
- self.nan_rep = getattr(self.attrs, 'nan_rep', None)
- self.encoding = _ensure_encoding(
- getattr(self.attrs, 'encoding', None))
- self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
- self.levels = getattr(
- self.attrs, 'levels', None) or []
- self.index_axes = [
- a.infer(self) for a in self.indexables if a.is_an_indexable
- ]
+ self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
+ self.data_columns = getattr(self.attrs, "data_columns", None) or []
+ self.info = getattr(self.attrs, "info", None) or dict()
+ self.nan_rep = getattr(self.attrs, "nan_rep", None)
+ self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
+ self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
+ self.levels = getattr(self.attrs, "levels", None) or []
+ self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable]
self.values_axes = [
a.infer(self) for a in self.indexables if not a.is_an_indexable
]
- self.metadata = getattr(
- self.attrs, 'metadata', None) or []
+ self.metadata = getattr(self.attrs, "metadata", None) or []
def validate_version(self, where=None):
""" are we trying to operate on an old version? """
if where is not None:
- if (self.version[0] <= 0 and self.version[1] <= 10 and
- self.version[2] < 1):
- ws = incompatibility_doc % '.'.join(
- [str(x) for x in self.version])
+ if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1:
+ ws = incompatibility_doc % ".".join([str(x) for x in self.version])
warnings.warn(ws, IncompatibilityWarning)
def validate_min_itemsize(self, min_itemsize):
@@ -3327,12 +3530,13 @@ def validate_min_itemsize(self, min_itemsize):
for k, v in min_itemsize.items():
# ok, apply generally
- if k == 'values':
+ if k == "values":
continue
if k not in q:
raise ValueError(
"min_itemsize has the key [{key}] which is not an axis or "
- "data_column".format(key=k))
+ "data_column".format(key=k)
+ )
@property
def indexables(self):
@@ -3342,10 +3546,12 @@ def indexables(self):
self._indexables = []
# index columns
- self._indexables.extend([
- IndexCol(name=name, axis=axis, pos=i)
- for i, (axis, name) in enumerate(self.attrs.index_cols)
- ])
+ self._indexables.extend(
+ [
+ IndexCol(name=name, axis=axis, pos=i)
+ for i, (axis, name) in enumerate(self.attrs.index_cols)
+ ]
+ )
# values columns
dc = set(self.data_columns)
@@ -3355,11 +3561,13 @@ def f(i, c):
klass = DataCol
if c in dc:
klass = DataIndexableCol
- return klass.create_for_block(i=i, name=c, pos=base_pos + i,
- version=self.version)
+ return klass.create_for_block(
+ i=i, name=c, pos=base_pos + i, version=self.version
+ )
self._indexables.extend(
- [f(i, c) for i, c in enumerate(self.attrs.values_cols)])
+ [f(i, c) for i, c in enumerate(self.attrs.values_cols)]
+ )
return self._indexables
@@ -3395,9 +3603,9 @@ def create_index(self, columns=None, optlevel=None, kind=None):
kw = dict()
if optlevel is not None:
- kw['optlevel'] = optlevel
+ kw["optlevel"] = optlevel
if kind is not None:
- kw['kind'] = kind
+ kw["kind"] = kind
table = self.table
for c in columns:
@@ -3413,23 +3621,24 @@ def create_index(self, columns=None, optlevel=None, kind=None):
if kind is not None and cur_kind != kind:
v.remove_index()
else:
- kw['kind'] = cur_kind
+ kw["kind"] = cur_kind
if optlevel is not None and cur_optlevel != optlevel:
v.remove_index()
else:
- kw['optlevel'] = cur_optlevel
+ kw["optlevel"] = cur_optlevel
# create the index
if not v.is_indexed:
- if v.type.startswith('complex'):
+ if v.type.startswith("complex"):
raise TypeError(
- 'Columns containing complex values can be stored '
- 'but cannot'
- ' be indexed when using table format. Either use '
- 'fixed format, set index=False, or do not include '
- 'the columns containing complex values to '
- 'data_columns when initializing the table.')
+ "Columns containing complex values can be stored "
+ "but cannot"
+ " be indexed when using table format. Either use "
+ "fixed format, set index=False, or do not include "
+ "the columns containing complex values to "
+ "data_columns when initializing the table."
+ )
v.create_index(**kw)
def read_axes(self, where, **kwargs):
@@ -3453,9 +3662,14 @@ def read_axes(self, where, **kwargs):
a.set_info(self.info)
# `kwargs` may contain `start` and `stop` arguments if passed to
# `store.select()`. If set they determine the index size.
- a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
- errors=self.errors, start=kwargs.get('start'),
- stop=kwargs.get('stop'))
+ a.convert(
+ values,
+ nan_rep=self.nan_rep,
+ encoding=self.encoding,
+ errors=self.errors,
+ start=kwargs.get("start"),
+ stop=kwargs.get("stop"),
+ )
return True
@@ -3473,9 +3687,11 @@ def validate_data_columns(self, data_columns, min_itemsize):
axis, axis_labels = self.non_index_axes[0]
info = self.info.get(axis, dict())
- if info.get('type') == 'MultiIndex' and data_columns:
- raise ValueError("cannot use a multi-index on axis [{0}] with "
- "data_columns {1}".format(axis, data_columns))
+ if info.get("type") == "MultiIndex" and data_columns:
+ raise ValueError(
+ "cannot use a multi-index on axis [{0}] with "
+ "data_columns {1}".format(axis, data_columns)
+ )
# evaluate the passed data_columns, True == use all columns
# take only valid axis labels
@@ -3488,16 +3704,27 @@ def validate_data_columns(self, data_columns, min_itemsize):
if isinstance(min_itemsize, dict):
existing_data_columns = set(data_columns)
- data_columns.extend([
- k for k in min_itemsize.keys()
- if k != 'values' and k not in existing_data_columns
- ])
+ data_columns.extend(
+ [
+ k
+ for k in min_itemsize.keys()
+ if k != "values" and k not in existing_data_columns
+ ]
+ )
# return valid columns in the order of our axis
return [c for c in data_columns if c in axis_labels]
- def create_axes(self, axes, obj, validate=True, nan_rep=None,
- data_columns=None, min_itemsize=None, **kwargs):
+ def create_axes(
+ self,
+ axes,
+ obj,
+ validate=True,
+ nan_rep=None,
+ data_columns=None,
+ min_itemsize=None,
+ **kwargs
+ ):
""" create and return the axes
legacy tables create an indexable column, indexable index,
non-indexable fields
@@ -3524,8 +3751,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
except KeyError:
raise TypeError(
"cannot properly create the storer for: [group->{group},"
- "value->{value}]".format(
- group=self.group._v_name, value=type(obj)))
+ "value->{value}]".format(group=self.group._v_name, value=type(obj))
+ )
# map axes to numbers
axes = [obj._get_axis_number(a) for a in axes]
@@ -3546,7 +3773,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
# currently support on ndim-1 axes
if len(axes) != self.ndim - 1:
raise ValueError(
- "currently only support ndim-1 indexers in an AppendableTable")
+ "currently only support ndim-1 indexers in an AppendableTable"
+ )
# create according to the new data
self.non_index_axes = []
@@ -3554,7 +3782,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
# nan_representation
if nan_rep is None:
- nan_rep = 'nan'
+ nan_rep = "nan"
self.nan_rep = nan_rep
@@ -3564,9 +3792,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
if i in axes:
name = obj._AXIS_NAMES[i]
- index_axes_map[i] = _convert_index(
- a, self.encoding, self.errors, self.format_type
- ).set_name(name).set_axis(i)
+ index_axes_map[i] = (
+ _convert_index(a, self.encoding, self.errors, self.format_type)
+ .set_name(name)
+ .set_axis(i)
+ )
else:
# we might be able to change the axes on the appending data if
@@ -3575,18 +3805,20 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
if existing_table is not None:
indexer = len(self.non_index_axes)
exist_axis = existing_table.non_index_axes[indexer][1]
- if not array_equivalent(np.array(append_axis),
- np.array(exist_axis)):
+ if not array_equivalent(
+ np.array(append_axis), np.array(exist_axis)
+ ):
# ahah! -> reindex
- if array_equivalent(np.array(sorted(append_axis)),
- np.array(sorted(exist_axis))):
+ if array_equivalent(
+ np.array(sorted(append_axis)), np.array(sorted(exist_axis))
+ ):
append_axis = exist_axis
# the non_index_axes info
info = _get_info(self.info, i)
- info['names'] = list(a.names)
- info['type'] = a.__class__.__name__
+ info["names"] = list(a.names)
+ info["type"] = a.__class__.__name__
self.non_index_axes.append((i, append_axis))
@@ -3614,12 +3846,10 @@ def get_blk_items(mgr, blocks):
blk_items = get_blk_items(block_obj._data, blocks)
if len(self.non_index_axes):
axis, axis_labels = self.non_index_axes[0]
- data_columns = self.validate_data_columns(
- data_columns, min_itemsize)
+ data_columns = self.validate_data_columns(data_columns, min_itemsize)
if len(data_columns):
mgr = block_obj.reindex(
- Index(axis_labels).difference(Index(data_columns)),
- axis=axis
+ Index(axis_labels).difference(Index(data_columns)), axis=axis
)._data
blocks = list(mgr.blocks)
@@ -3631,8 +3861,10 @@ def get_blk_items(mgr, blocks):
# reorder the blocks in the same order as the existing_table if we can
if existing_table is not None:
- by_items = {tuple(b_items.tolist()): (b, b_items)
- for b, b_items in zip(blocks, blk_items)}
+ by_items = {
+ tuple(b_items.tolist()): (b, b_items)
+ for b, b_items in zip(blocks, blk_items)
+ }
new_blocks = []
new_blk_items = []
for ea in existing_table.values_axes:
@@ -3645,8 +3877,9 @@ def get_blk_items(mgr, blocks):
raise ValueError(
"cannot match existing table structure for [{items}] "
"on appending data".format(
- items=(','.join(pprint_thing(item) for
- item in items))))
+ items=(",".join(pprint_thing(item) for item in items))
+ )
+ )
blocks = new_blocks
blk_items = new_blk_items
@@ -3659,8 +3892,7 @@ def get_blk_items(mgr, blocks):
name = None
# we have a data_column
- if (data_columns and len(b_items) == 1 and
- b_items[0] in data_columns):
+ if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
klass = DataIndexableCol
name = b_items[0]
self.data_columns.append(name)
@@ -3674,21 +3906,24 @@ def get_blk_items(mgr, blocks):
raise ValueError(
"Incompatible appended table [{blocks}]"
"with existing table [{table}]".format(
- blocks=blocks,
- table=existing_table.values_axes))
+ blocks=blocks, table=existing_table.values_axes
+ )
+ )
else:
existing_col = None
try:
- col = klass.create_for_block(
- i=i, name=name, version=self.version)
- col.set_atom(block=b, block_items=b_items,
- existing_col=existing_col,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- encoding=self.encoding,
- errors=self.errors,
- info=self.info)
+ col = klass.create_for_block(i=i, name=name, version=self.version)
+ col.set_atom(
+ block=b,
+ block_items=b_items,
+ existing_col=existing_col,
+ min_itemsize=min_itemsize,
+ nan_rep=nan_rep,
+ encoding=self.encoding,
+ errors=self.errors,
+ info=self.info,
+ )
col.set_pos(j)
self.values_axes.append(col)
@@ -3698,7 +3933,9 @@ def get_blk_items(mgr, blocks):
raise Exception(
"cannot find the correct atom type -> "
"[dtype->{name},items->{items}] {detail!s}".format(
- name=b.dtype.name, items=b_items, detail=detail))
+ name=b.dtype.name, items=b_items, detail=detail
+ )
+ )
j += 1
# validate our min_itemsize
@@ -3747,8 +3984,7 @@ def process_filter(field, filt):
filt = filt.union(Index(self.levels))
takers = op(axis_values, filt)
- return obj.loc._getitem_axis(takers,
- axis=axis_number)
+ return obj.loc._getitem_axis(takers, axis=axis_number)
# this might be the name of a file IN an axis
elif field in axis_values:
@@ -3761,38 +3997,42 @@ def process_filter(field, filt):
if isinstance(obj, DataFrame):
axis_number = 1 - axis_number
takers = op(values, filt)
- return obj.loc._getitem_axis(takers,
- axis=axis_number)
+ return obj.loc._getitem_axis(takers, axis=axis_number)
- raise ValueError("cannot find the field [{field}] for "
- "filtering!".format(field=field))
+ raise ValueError(
+ "cannot find the field [{field}] for "
+ "filtering!".format(field=field)
+ )
obj = process_filter(field, filt)
return obj
- def create_description(self, complib=None, complevel=None,
- fletcher32=False, expectedrows=None):
+ def create_description(
+ self, complib=None, complevel=None, fletcher32=False, expectedrows=None
+ ):
""" create the description of the table from the axes & values """
# use the provided expected rows if passed
if expectedrows is None:
expectedrows = max(self.nrows_expected, 10000)
- d = dict(name='table', expectedrows=expectedrows)
+ d = dict(name="table", expectedrows=expectedrows)
# description from the axes & values
- d['description'] = {a.cname: a.typ for a in self.axes}
+ d["description"] = {a.cname: a.typ for a in self.axes}
if complib:
if complevel is None:
complevel = self._complevel or 9
filters = _tables().Filters(
- complevel=complevel, complib=complib,
- fletcher32=fletcher32 or self._fletcher32)
- d['filters'] = filters
+ complevel=complevel,
+ complib=complib,
+ fletcher32=fletcher32 or self._fletcher32,
+ )
+ d["filters"] = filters
elif self._filters is not None:
- d['filters'] = self._filters
+ d["filters"] = self._filters
return d
@@ -3809,15 +4049,14 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
return False
# create the selection
- self.selection = Selection(
- self, where=where, start=start, stop=stop, **kwargs)
+ self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
coords = self.selection.select_coords()
if self.selection.filter is not None:
for field, op, filt in self.selection.filter.format():
data = self.read_column(
- field, start=coords.min(), stop=coords.max() + 1)
- coords = coords[
- op(data.iloc[coords - coords.min()], filt).values]
+ field, start=coords.min(), stop=coords.max() + 1
+ )
+ coords = coords[op(data.iloc[coords - coords.min()], filt).values]
return Index(coords)
@@ -3834,8 +4073,7 @@ def read_column(self, column, where=None, start=None, stop=None):
return False
if where is not None:
- raise TypeError("read_column does not currently accept a where "
- "clause")
+ raise TypeError("read_column does not currently accept a where " "clause")
# find the axes
for a in self.axes:
@@ -3844,20 +4082,27 @@ def read_column(self, column, where=None, start=None, stop=None):
if not a.is_data_indexable:
raise ValueError(
"column [{column}] can not be extracted individually; "
- "it is not data indexable".format(column=column))
+ "it is not data indexable".format(column=column)
+ )
# column must be an indexable or a data column
c = getattr(self.table.cols, column)
a.set_info(self.info)
- return Series(_set_tz(a.convert(c[start:stop],
- nan_rep=self.nan_rep,
- encoding=self.encoding,
- errors=self.errors
- ).take_data(),
- a.tz, True), name=column)
+ return Series(
+ _set_tz(
+ a.convert(
+ c[start:stop],
+ nan_rep=self.nan_rep,
+ encoding=self.encoding,
+ errors=self.errors,
+ ).take_data(),
+ a.tz,
+ True,
+ ),
+ name=column,
+ )
- raise KeyError(
- "column [{column}] not found in the table".format(column=column))
+ raise KeyError("column [{column}] not found in the table".format(column=column))
class WORMTable(Table):
@@ -3866,7 +4111,8 @@ class WORMTable(Table):
table. writing is a one-time operation; the data are stored in a format
that allows for searching the data on disk
"""
- table_type = 'worm'
+
+ table_type = "worm"
def read(self, **kwargs):
""" read the indices and the indexing array, calculate offset rows and
@@ -3889,12 +4135,13 @@ class LegacyTable(Table):
that can be easily searched
"""
+
_indexables = [
- IndexCol(name='index', axis=1, pos=0),
- IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'),
- DataCol(name='fields', cname='values', kind_attr='fields', pos=2)
+ IndexCol(name="index", axis=1, pos=0),
+ IndexCol(name="column", axis=2, pos=1, index_kind="columns_kind"),
+ DataCol(name="fields", cname="values", kind_attr="fields", pos=2),
] # type: Optional[List[IndexCol]]
- table_type = 'legacy'
+ table_type = "legacy"
ndim = 3
def write(self, **kwargs):
@@ -3911,20 +4158,32 @@ def read(self, where=None, columns=None, **kwargs):
class AppendableTable(LegacyTable):
""" support the new appendable table formats """
- _indexables = None
- table_type = 'appendable'
- def write(self, obj, axes=None, append=False, complib=None,
- complevel=None, fletcher32=None, min_itemsize=None,
- chunksize=None, expectedrows=None, dropna=False, **kwargs):
+ _indexables = None
+ table_type = "appendable"
+
+ def write(
+ self,
+ obj,
+ axes=None,
+ append=False,
+ complib=None,
+ complevel=None,
+ fletcher32=None,
+ min_itemsize=None,
+ chunksize=None,
+ expectedrows=None,
+ dropna=False,
+ **kwargs
+ ):
if not append and self.is_exists:
- self._handle.remove_node(self.group, 'table')
+ self._handle.remove_node(self.group, "table")
# create the axes
- self.create_axes(axes=axes, obj=obj, validate=append,
- min_itemsize=min_itemsize,
- **kwargs)
+ self.create_axes(
+ axes=axes, obj=obj, validate=append, min_itemsize=min_itemsize, **kwargs
+ )
for a in self.axes:
a.validate(self, append)
@@ -3932,10 +4191,12 @@ def write(self, obj, axes=None, append=False, complib=None,
if not self.is_exists:
# create the table
- options = self.create_description(complib=complib,
- complevel=complevel,
- fletcher32=fletcher32,
- expectedrows=expectedrows)
+ options = self.create_description(
+ complib=complib,
+ complevel=complevel,
+ fletcher32=fletcher32,
+ expectedrows=expectedrows,
+ )
# set the table attributes
self.set_attrs()
@@ -3973,7 +4234,7 @@ def write_data(self, chunksize, dropna=False):
# column, otherwise ignore the mask
mask = isna(a.data).all(axis=0)
if isinstance(mask, np.ndarray):
- masks.append(mask.astype('u1', copy=False))
+ masks.append(mask.astype("u1", copy=False))
# consolidate masks
if len(masks):
@@ -3992,13 +4253,13 @@ def write_data(self, chunksize, dropna=False):
# broadcast to all other indexes except myself
if i > 0 and i < nindexes:
- repeater = np.prod(
- [indexes[bi].shape[0] for bi in range(0, i)])
+ repeater = np.prod([indexes[bi].shape[0] for bi in range(0, i)])
idx = np.tile(idx, repeater)
if i < nindexes - 1:
- repeater = np.prod([indexes[bi].shape[0]
- for bi in range(i + 1, nindexes)])
+ repeater = np.prod(
+ [indexes[bi].shape[0] for bi in range(i + 1, nindexes)]
+ )
idx = np.repeat(idx, repeater)
bindexes.append(idx)
@@ -4006,8 +4267,7 @@ def write_data(self, chunksize, dropna=False):
# transpose the values so first dimension is last
# reshape the values if needed
values = [a.take_data() for a in self.values_axes]
- values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))
- for v in values]
+ values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
bvalues = []
for i, v in enumerate(values):
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
@@ -4029,7 +4289,8 @@ def write_data(self, chunksize, dropna=False):
rows,
indexes=[a[start_i:end_i] for a in bindexes],
mask=mask[start_i:end_i] if mask is not None else None,
- values=[v[start_i:end_i] for v in bvalues])
+ values=[v[start_i:end_i] for v in bvalues],
+ )
def write_data_chunk(self, rows, indexes, mask, values):
"""
@@ -4068,8 +4329,7 @@ def write_data_chunk(self, rows, indexes, mask, values):
rows = rows[m]
except Exception as detail:
- raise Exception(
- "cannot create row-data -> {detail}".format(detail=detail))
+ raise Exception("cannot create row-data -> {detail}".format(detail=detail))
try:
if len(rows):
@@ -4077,8 +4337,8 @@ def write_data_chunk(self, rows, indexes, mask, values):
self.table.flush()
except Exception as detail:
raise TypeError(
- "tables cannot write this data -> {detail}".format(
- detail=detail))
+ "tables cannot write this data -> {detail}".format(detail=detail)
+ )
def delete(self, where=None, start=None, stop=None, **kwargs):
@@ -4101,8 +4361,7 @@ def delete(self, where=None, start=None, stop=None, **kwargs):
# create the selection
table = self.table
- self.selection = Selection(
- self, where, start=start, stop=stop, **kwargs)
+ self.selection = Selection(self, where, start=start, stop=stop, **kwargs)
values = self.selection.select_coords()
# delete the rows in reverse order
@@ -4131,8 +4390,9 @@ def delete(self, where=None, start=None, stop=None, **kwargs):
pg = groups.pop()
for g in reversed(groups):
rows = sorted_series.take(range(g, pg))
- table.remove_rows(start=rows[rows.index[0]
- ], stop=rows[rows.index[-1]] + 1)
+ table.remove_rows(
+ start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
+ )
pg = g
self.table.flush()
@@ -4143,8 +4403,9 @@ def delete(self, where=None, start=None, stop=None, **kwargs):
class AppendableFrameTable(AppendableTable):
""" support the new appendable table formats """
- pandas_kind = 'frame_table'
- table_type = 'appendable_frame'
+
+ pandas_kind = "frame_table"
+ table_type = "appendable_frame"
ndim = 2
obj_type = DataFrame # type: Type[Union[DataFrame, Series]]
@@ -4163,29 +4424,32 @@ def read(self, where=None, columns=None, **kwargs):
if not self.read_axes(where=where, **kwargs):
return None
- info = (self.info.get(self.non_index_axes[0][0], dict())
- if len(self.non_index_axes) else dict())
+ info = (
+ self.info.get(self.non_index_axes[0][0], dict())
+ if len(self.non_index_axes)
+ else dict()
+ )
index = self.index_axes[0].values
frames = []
for a in self.values_axes:
# we could have a multi-index constructor here
# ensure_index doesn't recognize our list-of-tuples here
- if info.get('type') == 'MultiIndex':
+ if info.get("type") == "MultiIndex":
cols = MultiIndex.from_tuples(a.values)
else:
cols = Index(a.values)
- names = info.get('names')
+ names = info.get("names")
if names is not None:
cols.set_names(names, inplace=True)
if self.is_transposed:
values = a.cvalues
index_ = cols
- cols_ = Index(index, name=getattr(index, 'name', None))
+ cols_ = Index(index, name=getattr(index, "name", None))
else:
values = a.cvalues.T
- index_ = Index(index, name=getattr(index, 'name', None))
+ index_ = Index(index, name=getattr(index, "name", None))
cols_ = cols
# if we have a DataIndexableCol, its shape will only be 1 dim
@@ -4209,8 +4473,9 @@ def read(self, where=None, columns=None, **kwargs):
class AppendableSeriesTable(AppendableFrameTable):
""" support the new appendable table formats """
- pandas_kind = 'series_table'
- table_type = 'appendable_series'
+
+ pandas_kind = "series_table"
+ table_type = "appendable_series"
ndim = 2
obj_type = Series
storage_obj_type = DataFrame
@@ -4225,11 +4490,10 @@ def get_object(self, obj):
def write(self, obj, data_columns=None, **kwargs):
""" we are going to write this as a frame table """
if not isinstance(obj, DataFrame):
- name = obj.name or 'values'
+ name = obj.name or "values"
obj = DataFrame({name: obj}, index=obj.index)
obj.columns = [name]
- return super().write(obj=obj, data_columns=obj.columns.tolist(),
- **kwargs)
+ return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
def read(self, columns=None, **kwargs):
@@ -4245,19 +4509,20 @@ def read(self, columns=None, **kwargs):
s = s.iloc[:, 0]
# remove the default name
- if s.name == 'values':
+ if s.name == "values":
s.name = None
return s
class AppendableMultiSeriesTable(AppendableSeriesTable):
""" support the new appendable table formats """
- pandas_kind = 'series_table'
- table_type = 'appendable_multiseries'
+
+ pandas_kind = "series_table"
+ table_type = "appendable_multiseries"
def write(self, obj, **kwargs):
""" we are going to write this as a frame table """
- name = obj.name or 'values'
+ name = obj.name or "values"
obj, self.levels = self.validate_multiindex(obj)
cols = list(self.levels)
cols.append(name)
@@ -4267,8 +4532,9 @@ def write(self, obj, **kwargs):
class GenericTable(AppendableFrameTable):
""" a table that read/writes the generic pytables table format """
- pandas_kind = 'frame_table'
- table_type = 'generic_table'
+
+ pandas_kind = "frame_table"
+ table_type = "generic_table"
ndim = 2
obj_type = DataFrame
@@ -4278,7 +4544,7 @@ def pandas_type(self):
@property
def storable(self):
- return getattr(self.group, 'table', None) or self.group
+ return getattr(self.group, "table", None) or self.group
def get_attrs(self):
""" retrieve our attributes """
@@ -4286,10 +4552,10 @@ def get_attrs(self):
self.nan_rep = None
self.levels = []
- self.index_axes = [a.infer(self)
- for a in self.indexables if a.is_an_indexable]
- self.values_axes = [a.infer(self)
- for a in self.indexables if not a.is_an_indexable]
+ self.index_axes = [a.infer(self) for a in self.indexables if a.is_an_indexable]
+ self.values_axes = [
+ a.infer(self) for a in self.indexables if not a.is_an_indexable
+ ]
self.data_columns = [a.name for a in self.values_axes]
@property
@@ -4300,12 +4566,13 @@ def indexables(self):
d = self.description
# the index column is just a simple index
- self._indexables = [GenericIndexCol(name='index', axis=0)]
+ self._indexables = [GenericIndexCol(name="index", axis=0)]
for i, n in enumerate(d._v_names):
dc = GenericDataIndexableCol(
- name=n, pos=i, values=[n], version=self.version)
+ name=n, pos=i, values=[n], version=self.version
+ )
self._indexables.append(dc)
return self._indexables
@@ -4317,14 +4584,15 @@ def write(self, **kwargs):
class AppendableMultiFrameTable(AppendableFrameTable):
""" a frame with a multi-index """
- table_type = 'appendable_multiframe'
+
+ table_type = "appendable_multiframe"
obj_type = DataFrame
ndim = 2
_re_levels = re.compile(r"^level_\d+$")
@property
def table_type_short(self):
- return 'appendable_multi'
+ return "appendable_multi"
def write(self, obj, data_columns=None, **kwargs):
if data_columns is None:
@@ -4343,9 +4611,9 @@ def read(self, **kwargs):
df = df.set_index(self.levels)
# remove names for 'level_%d'
- df.index = df.index.set_names([
- None if self._re_levels.search(l) else l for l in df.index.names
- ])
+ df.index = df.index.set_names(
+ [None if self._re_levels.search(l) else l for l in df.index.names]
+ )
return df
@@ -4379,6 +4647,7 @@ def _get_info(info, name):
idx = info[name] = dict()
return idx
+
# tz to/from coercion
@@ -4404,146 +4673,172 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False):
coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
"""
if tz is not None:
- name = getattr(values, 'name', None)
+ name = getattr(values, "name", None)
values = values.ravel()
tz = timezones.get_timezone(_ensure_decoded(tz))
values = DatetimeIndex(values, name=name)
if values.tz is None:
- values = values.tz_localize('UTC').tz_convert(tz)
+ values = values.tz_localize("UTC").tz_convert(tz)
if preserve_UTC:
- if tz == 'UTC':
+ if tz == "UTC":
values = list(values)
elif coerce:
- values = np.asarray(values, dtype='M8[ns]')
+ values = np.asarray(values, dtype="M8[ns]")
return values
-def _convert_index(index, encoding=None, errors='strict', format_type=None):
- index_name = getattr(index, 'name', None)
+def _convert_index(index, encoding=None, errors="strict", format_type=None):
+ index_name = getattr(index, "name", None)
if isinstance(index, DatetimeIndex):
converted = index.asi8
- return IndexCol(converted, 'datetime64', _tables().Int64Col(),
- freq=getattr(index, 'freq', None),
- tz=getattr(index, 'tz', None),
- index_name=index_name)
+ return IndexCol(
+ converted,
+ "datetime64",
+ _tables().Int64Col(),
+ freq=getattr(index, "freq", None),
+ tz=getattr(index, "tz", None),
+ index_name=index_name,
+ )
elif isinstance(index, TimedeltaIndex):
converted = index.asi8
- return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
- freq=getattr(index, 'freq', None),
- index_name=index_name)
+ return IndexCol(
+ converted,
+ "timedelta64",
+ _tables().Int64Col(),
+ freq=getattr(index, "freq", None),
+ index_name=index_name,
+ )
elif isinstance(index, (Int64Index, PeriodIndex)):
atom = _tables().Int64Col()
# avoid to store ndarray of Period objects
- return IndexCol(index._ndarray_values, 'integer', atom,
- freq=getattr(index, 'freq', None),
- index_name=index_name)
+ return IndexCol(
+ index._ndarray_values,
+ "integer",
+ atom,
+ freq=getattr(index, "freq", None),
+ index_name=index_name,
+ )
if isinstance(index, MultiIndex):
- raise TypeError('MultiIndex not supported here!')
+ raise TypeError("MultiIndex not supported here!")
inferred_type = lib.infer_dtype(index, skipna=False)
values = np.asarray(index)
- if inferred_type == 'datetime64':
- converted = values.view('i8')
- return IndexCol(converted, 'datetime64', _tables().Int64Col(),
- freq=getattr(index, 'freq', None),
- tz=getattr(index, 'tz', None),
- index_name=index_name)
- elif inferred_type == 'timedelta64':
- converted = values.view('i8')
- return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
- freq=getattr(index, 'freq', None),
- index_name=index_name)
- elif inferred_type == 'datetime':
- converted = np.asarray([(time.mktime(v.timetuple()) +
- v.microsecond / 1E6) for v in values],
- dtype=np.float64)
- return IndexCol(converted, 'datetime', _tables().Time64Col(),
- index_name=index_name)
- elif inferred_type == 'date':
- converted = np.asarray([v.toordinal() for v in values],
- dtype=np.int32)
- return IndexCol(converted, 'date', _tables().Time32Col(),
- index_name=index_name)
- elif inferred_type == 'string':
+ if inferred_type == "datetime64":
+ converted = values.view("i8")
+ return IndexCol(
+ converted,
+ "datetime64",
+ _tables().Int64Col(),
+ freq=getattr(index, "freq", None),
+ tz=getattr(index, "tz", None),
+ index_name=index_name,
+ )
+ elif inferred_type == "timedelta64":
+ converted = values.view("i8")
+ return IndexCol(
+ converted,
+ "timedelta64",
+ _tables().Int64Col(),
+ freq=getattr(index, "freq", None),
+ index_name=index_name,
+ )
+ elif inferred_type == "datetime":
+ converted = np.asarray(
+ [(time.mktime(v.timetuple()) + v.microsecond / 1e6) for v in values],
+ dtype=np.float64,
+ )
+ return IndexCol(
+ converted, "datetime", _tables().Time64Col(), index_name=index_name
+ )
+ elif inferred_type == "date":
+ converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
+ return IndexCol(converted, "date", _tables().Time32Col(), index_name=index_name)
+ elif inferred_type == "string":
# atom = _tables().ObjectAtom()
# return np.asarray(values, dtype='O'), 'object', atom
converted = _convert_string_array(values, encoding, errors)
itemsize = converted.dtype.itemsize
return IndexCol(
- converted, 'string', _tables().StringCol(itemsize),
- itemsize=itemsize, index_name=index_name
+ converted,
+ "string",
+ _tables().StringCol(itemsize),
+ itemsize=itemsize,
+ index_name=index_name,
)
- elif inferred_type == 'unicode':
- if format_type == 'fixed':
+ elif inferred_type == "unicode":
+ if format_type == "fixed":
atom = _tables().ObjectAtom()
- return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
- index_name=index_name)
+ return IndexCol(
+ np.asarray(values, dtype="O"), "object", atom, index_name=index_name
+ )
raise TypeError(
- "[unicode] is not supported as a in index type for [{0}] formats"
- .format(format_type)
+ "[unicode] is not supported as a in index type for [{0}] formats".format(
+ format_type
+ )
)
- elif inferred_type == 'integer':
+ elif inferred_type == "integer":
# take a guess for now, hope the values fit
atom = _tables().Int64Col()
- return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom,
- index_name=index_name)
- elif inferred_type == 'floating':
+ return IndexCol(
+ np.asarray(values, dtype=np.int64), "integer", atom, index_name=index_name
+ )
+ elif inferred_type == "floating":
atom = _tables().Float64Col()
- return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom,
- index_name=index_name)
+ return IndexCol(
+ np.asarray(values, dtype=np.float64), "float", atom, index_name=index_name
+ )
else: # pragma: no cover
atom = _tables().ObjectAtom()
- return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
- index_name=index_name)
+ return IndexCol(
+ np.asarray(values, dtype="O"), "object", atom, index_name=index_name
+ )
-def _unconvert_index(data, kind, encoding=None, errors='strict'):
+def _unconvert_index(data, kind, encoding=None, errors="strict"):
kind = _ensure_decoded(kind)
- if kind == 'datetime64':
+ if kind == "datetime64":
index = DatetimeIndex(data)
- elif kind == 'timedelta64':
+ elif kind == "timedelta64":
index = TimedeltaIndex(data)
- elif kind == 'datetime':
- index = np.asarray([datetime.fromtimestamp(v) for v in data],
- dtype=object)
- elif kind == 'date':
+ elif kind == "datetime":
+ index = np.asarray([datetime.fromtimestamp(v) for v in data], dtype=object)
+ elif kind == "date":
try:
- index = np.asarray(
- [date.fromordinal(v) for v in data], dtype=object)
+ index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
except (ValueError):
- index = np.asarray(
- [date.fromtimestamp(v) for v in data], dtype=object)
- elif kind in ('integer', 'float'):
+ index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
+ elif kind in ("integer", "float"):
index = np.asarray(data)
- elif kind in ('string'):
- index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
- errors=errors)
- elif kind == 'object':
+ elif kind in ("string"):
+ index = _unconvert_string_array(
+ data, nan_rep=None, encoding=encoding, errors=errors
+ )
+ elif kind == "object":
index = np.asarray(data[0])
else: # pragma: no cover
- raise ValueError('unrecognized index type {kind}'.format(kind=kind))
+ raise ValueError("unrecognized index type {kind}".format(kind=kind))
return index
-def _unconvert_index_legacy(data, kind, legacy=False, encoding=None,
- errors='strict'):
+def _unconvert_index_legacy(data, kind, legacy=False, encoding=None, errors="strict"):
kind = _ensure_decoded(kind)
- if kind == 'datetime':
+ if kind == "datetime":
index = to_datetime(data)
- elif kind in ('integer'):
+ elif kind in ("integer"):
index = np.asarray(data, dtype=object)
- elif kind in ('string'):
- index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
- errors=errors)
+ elif kind in ("string"):
+ index = _unconvert_string_array(
+ data, nan_rep=None, encoding=encoding, errors=errors
+ )
else: # pragma: no cover
- raise ValueError('unrecognized index type {kind}'.format(kind=kind))
+ raise ValueError("unrecognized index type {kind}".format(kind=kind))
return index
@@ -4566,8 +4861,9 @@ def _convert_string_array(data, encoding, errors, itemsize=None):
# encode if needed
if encoding is not None and len(data):
- data = Series(data.ravel()).str.encode(
- encoding, errors).values.reshape(data.shape)
+ data = (
+ Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape)
+ )
# create the sized dtype
if itemsize is None:
@@ -4578,8 +4874,7 @@ def _convert_string_array(data, encoding, errors, itemsize=None):
return data
-def _unconvert_string_array(data, nan_rep=None, encoding=None,
- errors='strict'):
+def _unconvert_string_array(data, nan_rep=None, encoding=None, errors="strict"):
"""
inverse of _convert_string_array
@@ -4612,7 +4907,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None,
data = data.astype(dtype, copy=False).astype(object, copy=False)
if nan_rep is None:
- nan_rep = 'nan'
+ nan_rep = "nan"
data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
return data.reshape(shape)
@@ -4628,20 +4923,19 @@ def _maybe_convert(values, val_kind, encoding, errors):
def _get_converter(kind, encoding, errors):
kind = _ensure_decoded(kind)
- if kind == 'datetime64':
- return lambda x: np.asarray(x, dtype='M8[ns]')
- elif kind == 'datetime':
+ if kind == "datetime64":
+ return lambda x: np.asarray(x, dtype="M8[ns]")
+ elif kind == "datetime":
return lambda x: to_datetime(x, cache=True).to_pydatetime()
- elif kind == 'string':
- return lambda x: _unconvert_string_array(x, encoding=encoding,
- errors=errors)
+ elif kind == "string":
+ return lambda x: _unconvert_string_array(x, encoding=encoding, errors=errors)
else: # pragma: no cover
- raise ValueError('invalid kind {kind}'.format(kind=kind))
+ raise ValueError("invalid kind {kind}".format(kind=kind))
def _need_convert(kind):
kind = _ensure_decoded(kind)
- if kind in ('datetime', 'datetime64', 'string'):
+ if kind in ("datetime", "datetime64", "string"):
return True
return False
@@ -4674,7 +4968,7 @@ def __init__(self, table, where=None, start=None, stop=None):
# see if we have a passed coordinate like
try:
inferred = lib.infer_dtype(where, skipna=False)
- if inferred == 'integer' or inferred == 'boolean':
+ if inferred == "integer" or inferred == "boolean":
where = np.asarray(where)
if where.dtype == np.bool_:
start, stop = self.start, self.stop
@@ -4684,13 +4978,11 @@ def __init__(self, table, where=None, start=None, stop=None):
stop = self.table.nrows
self.coordinates = np.arange(start, stop)[where]
elif issubclass(where.dtype.type, np.integer):
- if ((self.start is not None and
- (where < self.start).any()) or
- (self.stop is not None and
- (where >= self.stop).any())):
+ if (self.start is not None and (where < self.start).any()) or (
+ self.stop is not None and (where >= self.stop).any()
+ ):
raise ValueError(
- "where must have index locations >= start and "
- "< stop"
+ "where must have index locations >= start and " "< stop"
)
self.coordinates = where
@@ -4723,8 +5015,9 @@ def generate(self, where):
"reference to\n"
" an axis (e.g. 'index' or 'columns'), or a "
"data_column\n"
- " The currently defined references are: {1}\n"
- .format(where, ','.join(q.keys()))
+ " The currently defined references are: {1}\n".format(
+ where, ",".join(q.keys())
+ )
)
def select(self):
@@ -4732,9 +5025,9 @@ def select(self):
generate the selection
"""
if self.condition is not None:
- return self.table.table.read_where(self.condition.format(),
- start=self.start,
- stop=self.stop)
+ return self.table.table.read_where(
+ self.condition.format(), start=self.start, stop=self.stop
+ )
elif self.coordinates is not None:
return self.table.table.read_coordinates(self.coordinates)
return self.table.table.read(start=self.start, stop=self.stop)
@@ -4755,9 +5048,9 @@ def select_coords(self):
stop += nrows
if self.condition is not None:
- return self.table.table.get_where_list(self.condition.format(),
- start=start, stop=stop,
- sort=True)
+ return self.table.table.get_where_list(
+ self.condition.format(), start=start, stop=stop, sort=True
+ )
elif self.coordinates is not None:
return self.coordinates
diff --git a/pandas/io/s3.py b/pandas/io/s3.py
index d784e8d473aac..0a7c082fec51c 100644
--- a/pandas/io/s3.py
+++ b/pandas/io/s3.py
@@ -4,8 +4,7 @@
from pandas.compat._optional import import_optional_dependency
s3fs = import_optional_dependency(
- "s3fs",
- extra="The s3fs package is required to handle s3 files."
+ "s3fs", extra="The s3fs package is required to handle s3 files."
)
@@ -15,12 +14,13 @@ def _strip_schema(url):
return result.netloc + result.path
-def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
- compression=None, mode=None):
+def get_filepath_or_buffer(
+ filepath_or_buffer, encoding=None, compression=None, mode=None
+):
from botocore.exceptions import NoCredentialsError
if mode is None:
- mode = 'rb'
+ mode = "rb"
fs = s3fs.S3FileSystem(anon=False)
try:
@@ -33,6 +33,5 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
- filepath_or_buffer = fs.open(
- _strip_schema(filepath_or_buffer), mode)
+ filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
return filepath_or_buffer, None, compression, True
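
For reference, the anonymous-access fallback above is what lets a plain s3:// URL keep working when no credentials are configured; a minimal usage sketch (the bucket and key are placeholders, and s3fs must be installed):

    import pandas as pd

    # pandas routes s3:// paths through pandas.io.s3.get_filepath_or_buffer;
    # when no AWS credentials are found it retries with anon=True, so public
    # objects remain readable. The bucket and key below are placeholders.
    df = pd.read_csv("s3://my-public-bucket/data.csv")
    print(df.head())
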
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 634bdfa93ba2e..7cc9dc11a8ccc 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -65,9 +65,17 @@ class SAS7BDATReader(BaseIterator):
bytes.
"""
- def __init__(self, path_or_buf, index=None, convert_dates=True,
- blank_missing=True, chunksize=None, encoding=None,
- convert_text=True, convert_header_text=True):
+ def __init__(
+ self,
+ path_or_buf,
+ index=None,
+ convert_dates=True,
+ blank_missing=True,
+ chunksize=None,
+ encoding=None,
+ convert_text=True,
+ convert_header_text=True,
+ ):
self.index = index
self.convert_dates = convert_dates
@@ -96,7 +104,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
if isinstance(self._path_or_buf, str):
- self._path_or_buf = open(self._path_or_buf, 'rb')
+ self._path_or_buf = open(self._path_or_buf, "rb")
self.handle = self._path_or_buf
self._get_properties()
@@ -113,7 +121,7 @@ def column_data_offsets(self):
def column_types(self):
"""Returns a numpy character array of the column types:
s (string) or d (double)"""
- return np.asarray(self._column_types, dtype=np.dtype('S1'))
+ return np.asarray(self._column_types, dtype=np.dtype("S1"))
def close(self):
try:
@@ -126,7 +134,7 @@ def _get_properties(self):
# Check magic number
self._path_or_buf.seek(0)
self._cached_page = self._path_or_buf.read(288)
- if self._cached_page[0:len(const.magic)] != const.magic:
+ if self._cached_page[0 : len(const.magic)] != const.magic:
self.close()
raise ValueError("magic number mismatch (not a SAS file?)")
@@ -150,9 +158,8 @@ def _get_properties(self):
total_align = align1 + align2
# Get endianness information
- buf = self._read_bytes(const.endianness_offset,
- const.endianness_length)
- if buf == b'\x01':
+ buf = self._read_bytes(const.endianness_offset, const.endianness_length)
+ if buf == b"\x01":
self.byte_order = "<"
else:
self.byte_order = ">"
@@ -166,36 +173,39 @@ def _get_properties(self):
# Get platform information
buf = self._read_bytes(const.platform_offset, const.platform_length)
- if buf == b'1':
+ if buf == b"1":
self.platform = "unix"
- elif buf == b'2':
+ elif buf == b"2":
self.platform = "windows"
else:
self.platform = "unknown"
buf = self._read_bytes(const.dataset_offset, const.dataset_length)
- self.name = buf.rstrip(b'\x00 ')
+ self.name = buf.rstrip(b"\x00 ")
if self.convert_header_text:
- self.name = self.name.decode(
- self.encoding or self.default_encoding)
+ self.name = self.name.decode(self.encoding or self.default_encoding)
buf = self._read_bytes(const.file_type_offset, const.file_type_length)
- self.file_type = buf.rstrip(b'\x00 ')
+ self.file_type = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.file_type = self.file_type.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
# Timestamp is epoch 01/01/1960
epoch = datetime(1960, 1, 1)
- x = self._read_float(const.date_created_offset + align1,
- const.date_created_length)
- self.date_created = epoch + pd.to_timedelta(x, unit='s')
- x = self._read_float(const.date_modified_offset + align1,
- const.date_modified_length)
- self.date_modified = epoch + pd.to_timedelta(x, unit='s')
-
- self.header_length = self._read_int(const.header_size_offset + align1,
- const.header_size_length)
+ x = self._read_float(
+ const.date_created_offset + align1, const.date_created_length
+ )
+ self.date_created = epoch + pd.to_timedelta(x, unit="s")
+ x = self._read_float(
+ const.date_modified_offset + align1, const.date_modified_length
+ )
+ self.date_modified = epoch + pd.to_timedelta(x, unit="s")
+
+ self.header_length = self._read_int(
+ const.header_size_offset + align1, const.header_size_length
+ )
# Read the rest of the header into cached_page.
buf = self._path_or_buf.read(self.header_length - 288)
@@ -204,44 +214,53 @@ def _get_properties(self):
self.close()
raise ValueError("The SAS7BDAT file appears to be truncated.")
- self._page_length = self._read_int(const.page_size_offset + align1,
- const.page_size_length)
- self._page_count = self._read_int(const.page_count_offset + align1,
- const.page_count_length)
-
- buf = self._read_bytes(const.sas_release_offset + total_align,
- const.sas_release_length)
- self.sas_release = buf.rstrip(b'\x00 ')
+ self._page_length = self._read_int(
+ const.page_size_offset + align1, const.page_size_length
+ )
+ self._page_count = self._read_int(
+ const.page_count_offset + align1, const.page_count_length
+ )
+
+ buf = self._read_bytes(
+ const.sas_release_offset + total_align, const.sas_release_length
+ )
+ self.sas_release = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.sas_release = self.sas_release.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
- buf = self._read_bytes(const.sas_server_type_offset + total_align,
- const.sas_server_type_length)
- self.server_type = buf.rstrip(b'\x00 ')
+ buf = self._read_bytes(
+ const.sas_server_type_offset + total_align, const.sas_server_type_length
+ )
+ self.server_type = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.server_type = self.server_type.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
- buf = self._read_bytes(const.os_version_number_offset + total_align,
- const.os_version_number_length)
- self.os_version = buf.rstrip(b'\x00 ')
+ buf = self._read_bytes(
+ const.os_version_number_offset + total_align, const.os_version_number_length
+ )
+ self.os_version = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.os_version = self.os_version.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
- buf = self._read_bytes(const.os_name_offset + total_align,
- const.os_name_length)
- buf = buf.rstrip(b'\x00 ')
+ buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
+ buf = buf.rstrip(b"\x00 ")
if len(buf) > 0:
self.os_name = buf.decode(self.encoding or self.default_encoding)
else:
- buf = self._read_bytes(const.os_maker_offset + total_align,
- const.os_maker_length)
- self.os_name = buf.rstrip(b'\x00 ')
+ buf = self._read_bytes(
+ const.os_maker_offset + total_align, const.os_maker_length
+ )
+ self.os_name = buf.rstrip(b"\x00 ")
if self.convert_header_text:
self.os_name = self.os_name.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
def __next__(self):
da = self.read(nrows=self.chunksize or 1)
@@ -281,7 +300,7 @@ def _read_bytes(self, offset, length):
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
- return self._cached_page[offset:offset + length]
+ return self._cached_page[offset : offset + length]
def _parse_metadata(self):
done = False
@@ -291,8 +310,7 @@ def _parse_metadata(self):
break
if len(self._cached_page) != self._page_length:
self.close()
- raise ValueError(
- "Failed to read a meta data page from the SAS file.")
+ raise ValueError("Failed to read a meta data page from the SAS file.")
done = self._process_page_meta()
def _process_page_meta(self):
@@ -302,43 +320,45 @@ def _process_page_meta(self):
self._process_page_metadata()
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
- return (is_data_page or is_mix_page
- or self._current_page_data_subheader_pointers != [])
+ return (
+ is_data_page
+ or is_mix_page
+ or self._current_page_data_subheader_pointers != []
+ )
def _read_page_header(self):
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = self._read_int(tx, const.page_type_length)
tx = const.block_count_offset + bit_offset
- self._current_page_block_count = self._read_int(
- tx, const.block_count_length)
+ self._current_page_block_count = self._read_int(tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
- self._current_page_subheaders_count = (
- self._read_int(tx, const.subheader_count_length))
+ self._current_page_subheaders_count = self._read_int(
+ tx, const.subheader_count_length
+ )
def _process_page_metadata(self):
bit_offset = self._page_bit_offset
for i in range(self._current_page_subheaders_count):
pointer = self._process_subheader_pointers(
- const.subheader_pointers_offset + bit_offset, i)
+ const.subheader_pointers_offset + bit_offset, i
+ )
if pointer.length == 0:
continue
if pointer.compression == const.truncated_subheader_id:
continue
- subheader_signature = self._read_subheader_signature(
- pointer.offset)
- subheader_index = (
- self._get_subheader_index(subheader_signature,
- pointer.compression, pointer.ptype))
+ subheader_signature = self._read_subheader_signature(pointer.offset)
+ subheader_index = self._get_subheader_index(
+ subheader_signature, pointer.compression, pointer.ptype
+ )
self._process_subheader(subheader_index, pointer)
def _get_subheader_index(self, signature, compression, ptype):
index = const.subheader_signature_to_index.get(signature)
if index is None:
- f1 = ((compression == const.compressed_subheader_id) or
- (compression == 0))
- f2 = (ptype == const.compressed_subheader_type)
+ f1 = (compression == const.compressed_subheader_id) or (compression == 0)
+ f2 = ptype == const.compressed_subheader_type
if (self.compression != "") and f1 and f2:
index = const.SASIndex.data_subheader_index
else:
@@ -349,8 +369,7 @@ def _get_subheader_index(self, signature, compression, ptype):
def _process_subheader_pointers(self, offset, subheader_pointer_index):
subheader_pointer_length = self._subheader_pointer_length
- total_offset = (offset +
- subheader_pointer_length * subheader_pointer_index)
+ total_offset = offset + subheader_pointer_length * subheader_pointer_index
subheader_offset = self._read_int(total_offset, self._int_length)
total_offset += self._int_length
@@ -416,13 +435,17 @@ def _process_rowsize_subheader(self, offset, length):
lcp_offset += 378
self.row_length = self._read_int(
- offset + const.row_length_offset_multiplier * int_len, int_len)
+ offset + const.row_length_offset_multiplier * int_len, int_len
+ )
self.row_count = self._read_int(
- offset + const.row_count_offset_multiplier * int_len, int_len)
+ offset + const.row_count_offset_multiplier * int_len, int_len
+ )
self.col_count_p1 = self._read_int(
- offset + const.col_count_p1_multiplier * int_len, int_len)
+ offset + const.col_count_p1_multiplier * int_len, int_len
+ )
self.col_count_p2 = self._read_int(
- offset + const.col_count_p2_multiplier * int_len, int_len)
+ offset + const.col_count_p2_multiplier * int_len, int_len
+ )
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_int(offset + mx, int_len)
self._lcs = self._read_int(lcs_offset, 2)
@@ -432,13 +455,15 @@ def _process_columnsize_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
self.column_count = self._read_int(offset, int_len)
- if (self.col_count_p1 + self.col_count_p2 !=
- self.column_count):
+ if self.col_count_p1 + self.col_count_p2 != self.column_count:
print(
"Warning: column count mismatch ({p1} + {p2} != "
"{column_count})\n".format(
- p1=self.col_count_p1, p2=self.col_count_p2,
- column_count=self.column_count))
+ p1=self.col_count_p1,
+ p2=self.col_count_p2,
+ column_count=self.column_count,
+ )
+ )
# Unknown purpose
def _process_subheader_counts(self, offset, length):
@@ -476,60 +501,74 @@ def _process_columntext_subheader(self, offset, length):
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
- self.creator_proc = buf[0:self._lcp]
+ self.creator_proc = buf[0 : self._lcp]
elif compression_literal == const.rle_compression:
offset1 = offset + 40
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
- self.creator_proc = buf[0:self._lcp]
+ self.creator_proc = buf[0 : self._lcp]
elif self._lcs > 0:
self._lcp = 0
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcs)
- self.creator_proc = buf[0:self._lcp]
+ self.creator_proc = buf[0 : self._lcp]
if self.convert_header_text:
if hasattr(self, "creator_proc"):
self.creator_proc = self.creator_proc.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
def _process_columnname_subheader(self, offset, length):
int_len = self._int_length
offset += int_len
column_name_pointers_count = (length - 2 * int_len - 12) // 8
for i in range(column_name_pointers_count):
- text_subheader = offset + const.column_name_pointer_length * \
- (i + 1) + const.column_name_text_subheader_offset
- col_name_offset = offset + const.column_name_pointer_length * \
- (i + 1) + const.column_name_offset_offset
- col_name_length = offset + const.column_name_pointer_length * \
- (i + 1) + const.column_name_length_offset
+ text_subheader = (
+ offset
+ + const.column_name_pointer_length * (i + 1)
+ + const.column_name_text_subheader_offset
+ )
+ col_name_offset = (
+ offset
+ + const.column_name_pointer_length * (i + 1)
+ + const.column_name_offset_offset
+ )
+ col_name_length = (
+ offset
+ + const.column_name_pointer_length * (i + 1)
+ + const.column_name_length_offset
+ )
idx = self._read_int(
- text_subheader, const.column_name_text_subheader_length)
+ text_subheader, const.column_name_text_subheader_length
+ )
col_offset = self._read_int(
- col_name_offset, const.column_name_offset_length)
- col_len = self._read_int(
- col_name_length, const.column_name_length_length)
+ col_name_offset, const.column_name_offset_length
+ )
+ col_len = self._read_int(col_name_length, const.column_name_length_length)
name_str = self.column_names_strings[idx]
- self.column_names.append(name_str[col_offset:col_offset + col_len])
+ self.column_names.append(name_str[col_offset : col_offset + col_len])
def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
- column_attributes_vectors_count = (
- length - 2 * int_len - 12) // (int_len + 8)
+ column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
for i in range(column_attributes_vectors_count):
- col_data_offset = (offset + int_len +
- const.column_data_offset_offset +
- i * (int_len + 8))
- col_data_len = (offset + 2 * int_len +
- const.column_data_length_offset +
- i * (int_len + 8))
- col_types = (offset + 2 * int_len +
- const.column_type_offset + i * (int_len + 8))
+ col_data_offset = (
+ offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
+ )
+ col_data_len = (
+ offset
+ + 2 * int_len
+ + const.column_data_length_offset
+ + i * (int_len + 8)
+ )
+ col_types = (
+ offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
+ )
x = self._read_int(col_data_offset, int_len)
self._column_data_offsets.append(x)
@@ -538,7 +577,7 @@ def _process_columnattributes_subheader(self, offset, length):
self._column_data_lengths.append(x)
x = self._read_int(col_types, const.column_type_length)
- self._column_types.append(b'd' if x == 1 else b's')
+ self._column_types.append(b"d" if x == 1 else b"s")
def _process_columnlist_subheader(self, offset, length):
# unknown purpose
@@ -547,47 +586,38 @@ def _process_columnlist_subheader(self, offset, length):
def _process_format_subheader(self, offset, length):
int_len = self._int_length
text_subheader_format = (
- offset +
- const.column_format_text_subheader_index_offset +
- 3 * int_len)
- col_format_offset = (offset +
- const.column_format_offset_offset +
- 3 * int_len)
- col_format_len = (offset +
- const.column_format_length_offset +
- 3 * int_len)
+ offset + const.column_format_text_subheader_index_offset + 3 * int_len
+ )
+ col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
+ col_format_len = offset + const.column_format_length_offset + 3 * int_len
text_subheader_label = (
- offset +
- const.column_label_text_subheader_index_offset +
- 3 * int_len)
- col_label_offset = (offset +
- const.column_label_offset_offset +
- 3 * int_len)
+ offset + const.column_label_text_subheader_index_offset + 3 * int_len
+ )
+ col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
col_label_len = offset + const.column_label_length_offset + 3 * int_len
- x = self._read_int(text_subheader_format,
- const.column_format_text_subheader_index_length)
+ x = self._read_int(
+ text_subheader_format, const.column_format_text_subheader_index_length
+ )
format_idx = min(x, len(self.column_names_strings) - 1)
format_start = self._read_int(
- col_format_offset, const.column_format_offset_length)
- format_len = self._read_int(
- col_format_len, const.column_format_length_length)
+ col_format_offset, const.column_format_offset_length
+ )
+ format_len = self._read_int(col_format_len, const.column_format_length_length)
label_idx = self._read_int(
- text_subheader_label,
- const.column_label_text_subheader_index_length)
+ text_subheader_label, const.column_label_text_subheader_index_length
+ )
label_idx = min(label_idx, len(self.column_names_strings) - 1)
- label_start = self._read_int(
- col_label_offset, const.column_label_offset_length)
- label_len = self._read_int(col_label_len,
- const.column_label_length_length)
+ label_start = self._read_int(col_label_offset, const.column_label_offset_length)
+ label_len = self._read_int(col_label_len, const.column_label_length_length)
label_names = self.column_names_strings[label_idx]
- column_label = label_names[label_start: label_start + label_len]
+ column_label = label_names[label_start : label_start + label_len]
format_names = self.column_names_strings[format_idx]
- column_format = format_names[format_start: format_start + format_len]
+ column_format = format_names[format_start : format_start + format_len]
current_column_number = len(self.columns)
col = _column()
@@ -619,8 +649,8 @@ def read(self, nrows=None):
if nrows > m:
nrows = m
- nd = self._column_types.count(b'd')
- ns = self._column_types.count(b's')
+ nd = self._column_types.count(b"d")
+ ns = self._column_types.count(b"s")
self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
@@ -642,10 +672,8 @@ def _read_next_page(self):
return True
elif len(self._cached_page) != self._page_length:
self.close()
- msg = ("failed to read complete page from file "
- "(read {:d} of {:d} bytes)")
- raise ValueError(msg.format(len(self._cached_page),
- self._page_length))
+ msg = "failed to read complete page from file " "(read {:d} of {:d} bytes)"
+ raise ValueError(msg.format(len(self._cached_page), self._page_length))
self._read_page_header()
page_type = self._current_page_type
@@ -671,32 +699,34 @@ def _chunk_to_dataframe(self):
name = self.column_names[j]
- if self._column_types[j] == b'd':
- rslt[name] = self._byte_chunk[jb, :].view(
- dtype=self.byte_order + 'd')
+ if self._column_types[j] == b"d":
+ rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
if self.convert_dates:
unit = None
if self.column_formats[j] in const.sas_date_formats:
- unit = 'd'
+ unit = "d"
elif self.column_formats[j] in const.sas_datetime_formats:
- unit = 's'
+ unit = "s"
if unit:
- rslt[name] = pd.to_datetime(rslt[name], unit=unit,
- origin="1960-01-01")
+ rslt[name] = pd.to_datetime(
+ rslt[name], unit=unit, origin="1960-01-01"
+ )
jb += 1
- elif self._column_types[j] == b's':
+ elif self._column_types[j] == b"s":
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
- self.encoding or self.default_encoding)
+ self.encoding or self.default_encoding
+ )
if self.blank_missing:
ii = rslt[name].str.len() == 0
rslt.loc[ii, name] = np.nan
js += 1
else:
self.close()
- raise ValueError("unknown column type {type}".format(
- type=self._column_types[j]))
+ raise ValueError(
+ "unknown column type {type}".format(type=self._column_types[j])
+ )
return rslt
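
The date handling reformatted in _get_properties and _chunk_to_dataframe above relies on SAS storing dates as day counts (and datetimes as second counts) since the 1960-01-01 epoch; a small self-contained sketch of that conversion, with invented counts:

    import pandas as pd

    # SAS date values are days since 1960-01-01; datetimes are seconds since
    # the same epoch. The reader converts both with an explicit origin, as in
    # the to_datetime calls above.
    days = [0, 365, 21915]
    print(pd.to_datetime(days, unit="d", origin="1960-01-01"))
    # -> 1960-01-01, 1960-12-31, 2020-01-01
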
diff --git a/pandas/io/sas/sas_constants.py b/pandas/io/sas/sas_constants.py
index c37a26cd62ad2..23b23a1bf09c0 100644
--- a/pandas/io/sas/sas_constants.py
+++ b/pandas/io/sas/sas_constants.py
@@ -1,13 +1,15 @@
-magic = (b"\x00\x00\x00\x00\x00\x00\x00\x00" +
- b"\x00\x00\x00\x00\xc2\xea\x81\x60" +
- b"\xb3\x14\x11\xcf\xbd\x92\x08\x00" +
- b"\x09\xc7\x31\x8c\x18\x1f\x10\x11")
+magic = (
+ b"\x00\x00\x00\x00\x00\x00\x00\x00"
+ + b"\x00\x00\x00\x00\xc2\xea\x81\x60"
+ + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
+ + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
+)
-align_1_checker_value = b'3'
+align_1_checker_value = b"3"
align_1_offset = 32
align_1_length = 1
align_1_value = 4
-u64_byte_checker_value = b'3'
+u64_byte_checker_value = b"3"
align_2_offset = 35
align_2_length = 1
align_2_value = 4
@@ -91,15 +93,22 @@
column_label_offset_length = 2
column_label_length_offset = 32
column_label_length_length = 2
-rle_compression = b'SASYZCRL'
-rdc_compression = b'SASYZCR2'
+rle_compression = b"SASYZCRL"
+rdc_compression = b"SASYZCR2"
compression_literals = [rle_compression, rdc_compression]
# Incomplete list of encodings, using SAS nomenclature:
# http://support.sas.com/documentation/cdl/en/nlsref/61893/HTML/default/viewer.htm#a002607278.htm
-encoding_names = {29: "latin1", 20: "utf-8", 33: "cyrillic", 60: "wlatin2",
- 61: "wcyrillic", 62: "wlatin1", 90: "ebcdic870"}
+encoding_names = {
+ 29: "latin1",
+ 20: "utf-8",
+ 33: "cyrillic",
+ 60: "wlatin2",
+ 61: "wcyrillic",
+ 62: "wlatin1",
+ 90: "ebcdic870",
+}
class SASIndex:
@@ -144,28 +153,101 @@ class SASIndex:
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index}
+ b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
+}
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
-sas_date_formats = ("DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
- "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
- "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
- "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
- "WORDDATE", "WORDDATX", "YEAR", "YYMM", "YYMMC", "YYMMD",
- "YYMMP", "YYMMS", "YYMMN", "YYMON", "YYMMDD", "YYQ",
- "YYQC", "YYQD", "YYQP", "YYQS", "YYQN", "YYQR", "YYQRC",
- "YYQRD", "YYQRP", "YYQRS", "YYQRN",
- "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
- "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
- "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
- "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
- "MINGUO")
+sas_date_formats = (
+ "DATE",
+ "DAY",
+ "DDMMYY",
+ "DOWNAME",
+ "JULDAY",
+ "JULIAN",
+ "MMDDYY",
+ "MMYY",
+ "MMYYC",
+ "MMYYD",
+ "MMYYP",
+ "MMYYS",
+ "MMYYN",
+ "MONNAME",
+ "MONTH",
+ "MONYY",
+ "QTR",
+ "QTRR",
+ "NENGO",
+ "WEEKDATE",
+ "WEEKDATX",
+ "WEEKDAY",
+ "WEEKV",
+ "WORDDATE",
+ "WORDDATX",
+ "YEAR",
+ "YYMM",
+ "YYMMC",
+ "YYMMD",
+ "YYMMP",
+ "YYMMS",
+ "YYMMN",
+ "YYMON",
+ "YYMMDD",
+ "YYQ",
+ "YYQC",
+ "YYQD",
+ "YYQP",
+ "YYQS",
+ "YYQN",
+ "YYQR",
+ "YYQRC",
+ "YYQRD",
+ "YYQRP",
+ "YYQRS",
+ "YYQRN",
+ "YYMMDDP",
+ "YYMMDDC",
+ "E8601DA",
+ "YYMMDDN",
+ "MMDDYYC",
+ "MMDDYYS",
+ "MMDDYYD",
+ "YYMMDDS",
+ "B8601DA",
+ "DDMMYYN",
+ "YYMMDDD",
+ "DDMMYYB",
+ "DDMMYYP",
+ "MMDDYYP",
+ "YYMMDDB",
+ "MMDDYYN",
+ "DDMMYYC",
+ "DDMMYYD",
+ "DDMMYYS",
+ "MINGUO",
+)
-sas_datetime_formats = ("DATETIME", "DTWKDATX",
- "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
- "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
- "DATEAMPM", "DTDATE", "DTMONYY", "DTMONYY", "DTWKDATX",
- "DTYEAR", "TOD", "MDYAMPM")
+sas_datetime_formats = (
+ "DATETIME",
+ "DTWKDATX",
+ "B8601DN",
+ "B8601DT",
+ "B8601DX",
+ "B8601DZ",
+ "B8601LX",
+ "E8601DN",
+ "E8601DT",
+ "E8601DX",
+ "E8601DZ",
+ "E8601LX",
+ "DATEAMPM",
+ "DTDATE",
+ "DTMONYY",
+ "DTMONYY",
+ "DTWKDATX",
+ "DTYEAR",
+ "TOD",
+ "MDYAMPM",
+)
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index 0dbea452230d6..34b93d72d0e29 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -21,17 +21,39 @@
from pandas.io.common import BaseIterator, get_filepath_or_buffer
-_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
- "000000000000000000000000000000 ")
-_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!"
- "000000000000000001600000000")
-_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
- "000000000000000000000000000000 ")
-_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
- "000000000000000000000000000000 ")
-_fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label',
- 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform',
- 'nifl', 'nifd', 'npos', '_']
+_correct_line1 = (
+ "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 "
+)
+_correct_header1 = (
+ "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" "000000000000000001600000000"
+)
+_correct_header2 = (
+ "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 "
+)
+_correct_obs_header = (
+ "HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
+ "000000000000000000000000000000 "
+)
+_fieldkeys = [
+ "ntype",
+ "nhfun",
+ "field_length",
+ "nvar0",
+ "name",
+ "label",
+ "nform",
+ "nfl",
+ "num_decimals",
+ "nfj",
+ "nfill",
+ "niform",
+ "nifl",
+ "nifd",
+ "npos",
+ "_",
+]
_base_params_doc = """\
@@ -80,10 +102,12 @@
>>> for chunk in itr:
>>> do_something(chunk)
-""" % {"_base_params_doc": _base_params_doc,
- "_format_params_doc": _format_params_doc,
- "_params2_doc": _params2_doc,
- "_iterator_doc": _iterator_doc}
+""" % {
+ "_base_params_doc": _base_params_doc,
+ "_format_params_doc": _format_params_doc,
+ "_params2_doc": _params2_doc,
+ "_iterator_doc": _iterator_doc,
+}
_xport_reader_doc = """\
@@ -98,8 +122,10 @@
Contains information about the file
fields : list
Contains information about the variables in the file
-""" % {"_base_params_doc": _base_params_doc,
- "_params2_doc": _params2_doc}
+""" % {
+ "_base_params_doc": _base_params_doc,
+ "_params2_doc": _params2_doc,
+}
_read_method_doc = """\
@@ -142,9 +168,9 @@ def _split_line(s, parts):
out = {}
start = 0
for name, length in parts:
- out[name] = s[start:start + length].strip()
+ out[name] = s[start : start + length].strip()
start += length
- del out['_']
+ del out["_"]
return out
@@ -158,10 +184,10 @@ def _handle_truncated_float_vec(vec, nbytes):
# The R "foreign" library
if nbytes != 8:
- vec1 = np.zeros(len(vec), np.dtype('S8'))
- dtype = np.dtype('S%d,S%d' % (nbytes, 8 - nbytes))
+ vec1 = np.zeros(len(vec), np.dtype("S8"))
+ dtype = np.dtype("S%d,S%d" % (nbytes, 8 - nbytes))
vec2 = vec1.view(dtype=dtype)
- vec2['f0'] = vec
+ vec2["f0"] = vec
return vec2
return vec
@@ -173,14 +199,14 @@ def _parse_float_vec(vec):
native 8 byte floats.
"""
- dtype = np.dtype('>u4,>u4')
+ dtype = np.dtype(">u4,>u4")
vec1 = vec.view(dtype=dtype)
- xport1 = vec1['f0']
- xport2 = vec1['f1']
+ xport1 = vec1["f0"]
+ xport2 = vec1["f1"]
# Start by setting first half of ieee number to first half of IBM
# number sans exponent
- ieee1 = xport1 & 0x00ffffff
+ ieee1 = xport1 & 0x00FFFFFF
# The fraction bit to the left of the binary point in the ieee
# format was set and the number was shifted 0, 1, 2, or 3
@@ -203,7 +229,7 @@ def _parse_float_vec(vec):
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
# clear the 1 bit to the left of the binary point
- ieee1 &= 0xffefffff
+ ieee1 &= 0xFFEFFFFF
# set the exponent of the ieee number to be the actual exponent
# plus the shift count + 1023. Or this into the first half of the
@@ -212,14 +238,15 @@ def _parse_float_vec(vec):
# incremented by 1 and the fraction bits left 4 positions to the
# right of the radix point. (had to add >> 24 because C treats &
# 0x7f as 0x7f000000 and Python doesn't)
- ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +
- shift + 1023) << 20) | (xport1 & 0x80000000)
+ ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
+ xport1 & 0x80000000
+ )
- ieee = np.empty((len(ieee1),), dtype='>u4,>u4')
- ieee['f0'] = ieee1
- ieee['f1'] = ieee2
- ieee = ieee.view(dtype='>f8')
- ieee = ieee.astype('f8')
+ ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
+ ieee["f0"] = ieee1
+ ieee["f1"] = ieee2
+ ieee = ieee.view(dtype=">f8")
+ ieee = ieee.astype("f8")
return ieee
@@ -227,8 +254,9 @@ def _parse_float_vec(vec):
class XportReader(BaseIterator):
__doc__ = _xport_reader_doc
- def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
- chunksize=None):
+ def __init__(
+ self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None
+ ):
self._encoding = encoding
self._lines_read = 0
@@ -236,12 +264,15 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1',
self._chunksize = chunksize
if isinstance(filepath_or_buffer, str):
- (filepath_or_buffer, encoding,
- compression, should_close) = get_filepath_or_buffer(
- filepath_or_buffer, encoding=encoding)
+ (
+ filepath_or_buffer,
+ encoding,
+ compression,
+ should_close,
+ ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding)
if isinstance(filepath_or_buffer, (str, bytes)):
- self.filepath_or_buffer = open(filepath_or_buffer, 'rb')
+ self.filepath_or_buffer = open(filepath_or_buffer, "rb")
else:
# Copy to BytesIO, and ensure no encoding
contents = filepath_or_buffer.read()
@@ -269,23 +300,22 @@ def _read_header(self):
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
- fif = [['prefix', 24], ['version', 8], ['OS', 8],
- ['_', 24], ['created', 16]]
+ fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
file_info = _split_line(line2, fif)
- if file_info['prefix'] != "SAS SAS SASLIB":
+ if file_info["prefix"] != "SAS SAS SASLIB":
self.close()
raise ValueError("Header record has invalid prefix.")
- file_info['created'] = _parse_date(file_info['created'])
+ file_info["created"] = _parse_date(file_info["created"])
self.file_info = file_info
line3 = self._get_row()
- file_info['modified'] = _parse_date(line3[:16])
+ file_info["modified"] = _parse_date(line3[:16])
# read member header
header1 = self._get_row()
header2 = self._get_row()
headflag1 = header1.startswith(_correct_header1)
- headflag2 = (header2 == _correct_header2)
+ headflag2 = header2 == _correct_header2
if not (headflag1 and headflag2):
self.close()
raise ValueError("Member header not found")
@@ -293,17 +323,24 @@ def _read_header(self):
fieldnamelength = int(header1[-5:-2])
# member info
- mem = [['prefix', 8], ['set_name', 8], ['sasdata', 8],
- ['version', 8], ['OS', 8], ['_', 24], ['created', 16]]
+ mem = [
+ ["prefix", 8],
+ ["set_name", 8],
+ ["sasdata", 8],
+ ["version", 8],
+ ["OS", 8],
+ ["_", 24],
+ ["created", 16],
+ ]
member_info = _split_line(self._get_row(), mem)
- mem = [['modified', 16], ['_', 16], ['label', 40], ['type', 8]]
+ mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
member_info.update(_split_line(self._get_row(), mem))
- member_info['modified'] = _parse_date(member_info['modified'])
- member_info['created'] = _parse_date(member_info['created'])
+ member_info["modified"] = _parse_date(member_info["modified"])
+ member_info["created"] = _parse_date(member_info["created"])
self.member_info = member_info
# read field names
- types = {1: 'numeric', 2: 'char'}
+ types = {1: "numeric", 2: "char"}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
# round up to nearest 80
@@ -314,19 +351,21 @@ def _read_header(self):
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
- field, fielddata = (fielddata[:fieldnamelength],
- fielddata[fieldnamelength:])
+ field, fielddata = (
+ fielddata[:fieldnamelength],
+ fielddata[fieldnamelength:],
+ )
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
field = field.ljust(140)
- fieldstruct = struct.unpack('>hhhh8s40s8shhh2s8shhl52s', field)
+ fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", field)
field = dict(zip(_fieldkeys, fieldstruct))
- del field['_']
- field['ntype'] = types[field['ntype']]
- fl = field['field_length']
- if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)):
+ del field["_"]
+ field["ntype"] = types[field["ntype"]]
+ fl = field["field_length"]
+ if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
self.close()
msg = "Floating field width {0} is not between 2 and 8."
raise TypeError(msg.format(fl))
@@ -337,7 +376,7 @@ def _read_header(self):
except AttributeError:
pass
- obs_length += field['field_length']
+ obs_length += field["field_length"]
fields += [field]
header = self._get_row()
@@ -350,11 +389,13 @@ def _read_header(self):
self.record_start = self.filepath_or_buffer.tell()
self.nobs = self._record_count()
- self.columns = [x['name'].decode() for x in self.fields]
+ self.columns = [x["name"].decode() for x in self.fields]
# Setup the dtype.
- dtypel = [('s' + str(i), "S" + str(field['field_length']))
- for i, field in enumerate(self.fields)]
+ dtypel = [
+ ("s" + str(i), "S" + str(field["field_length"]))
+ for i, field in enumerate(self.fields)
+ ]
dtype = np.dtype(dtypel)
self._dtype = dtype
@@ -372,8 +413,7 @@ def _record_count(self):
"""
self.filepath_or_buffer.seek(0, 2)
- total_records_length = (self.filepath_or_buffer.tell() -
- self.record_start)
+ total_records_length = self.filepath_or_buffer.tell() - self.record_start
if total_records_length % 80 != 0:
warnings.warn("xport file may be corrupted")
@@ -416,10 +456,13 @@ def get_chunk(self, size=None):
return self.read(nrows=size)
def _missing_double(self, vec):
- v = vec.view(dtype='u1,u1,u2,u4')
- miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0)
- miss1 = (((v['f0'] >= 0x41) & (v['f0'] <= 0x5a)) |
- (v['f0'] == 0x5f) | (v['f0'] == 0x2e))
+ v = vec.view(dtype="u1,u1,u2,u4")
+ miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
+ miss1 = (
+ ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
+ | (v["f0"] == 0x5F)
+ | (v["f0"] == 0x2E)
+ )
miss &= miss1
return miss
@@ -439,15 +482,14 @@ def read(self, nrows=None):
df = pd.DataFrame(index=range(read_lines))
for j, x in enumerate(self.columns):
- vec = data['s%d' % j]
- ntype = self.fields[j]['ntype']
+ vec = data["s%d" % j]
+ ntype = self.fields[j]["ntype"]
if ntype == "numeric":
- vec = _handle_truncated_float_vec(
- vec, self.fields[j]['field_length'])
+ vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
miss = self._missing_double(vec)
v = _parse_float_vec(vec)
v[miss] = np.nan
- elif self.fields[j]['ntype'] == 'char':
+ elif self.fields[j]["ntype"] == "char":
v = [y.rstrip() for y in vec]
if self._encoding is not None:
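
The header parsing above leans on _split_line to carve fixed-width records into named fields; a standalone sketch of the same idea (the record contents and widths here are invented, not an actual XPORT header):

    # Mirror of _split_line: slice a fixed-width record by (name, length)
    # pairs and drop the filler field named "_".
    def split_line(s, parts):
        out = {}
        start = 0
        for name, length in parts:
            out[name] = s[start : start + length].strip()
            start += length
        del out["_"]
        return out

    record = "SAS     9.4     LIN     " + " " * 24 + "01JAN20:00:00:00"
    fif = [("prefix", 8), ("version", 8), ("OS", 8), ("_", 24), ("created", 16)]
    print(split_line(record, fif))
    # -> {'prefix': 'SAS', 'version': '9.4', 'OS': 'LIN', 'created': '01JAN20:00:00:00'}
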
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index 0726e17e3bbab..680425f421eec 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -4,8 +4,14 @@
from pandas.io.common import _stringify_path
-def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
- chunksize=None, iterator=False):
+def read_sas(
+ filepath_or_buffer,
+ format=None,
+ index=None,
+ encoding=None,
+ chunksize=None,
+ iterator=False,
+):
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
@@ -31,9 +37,11 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
or XportReader
"""
if format is None:
- buffer_error_msg = ("If this is a buffer object rather "
- "than a string name, you must specify "
- "a format string")
+ buffer_error_msg = (
+ "If this is a buffer object rather "
+ "than a string name, you must specify "
+ "a format string"
+ )
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
raise ValueError(buffer_error_msg)
@@ -45,18 +53,20 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
else:
raise ValueError("unable to infer format of SAS file")
- if format.lower() == 'xport':
+ if format.lower() == "xport":
from pandas.io.sas.sas_xport import XportReader
- reader = XportReader(filepath_or_buffer, index=index,
- encoding=encoding,
- chunksize=chunksize)
- elif format.lower() == 'sas7bdat':
+
+ reader = XportReader(
+ filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
+ )
+ elif format.lower() == "sas7bdat":
from pandas.io.sas.sas7bdat import SAS7BDATReader
- reader = SAS7BDATReader(filepath_or_buffer, index=index,
- encoding=encoding,
- chunksize=chunksize)
+
+ reader = SAS7BDATReader(
+ filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
+ )
else:
- raise ValueError('unknown SAS format')
+ raise ValueError("unknown SAS format")
if iterator or chunksize:
return reader
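
A hedged usage sketch of the dispatcher above ("example.sas7bdat" is a hypothetical path): the format is inferred from the extension, and passing chunksize returns the reader itself rather than a DataFrame:

    import pandas as pd

    # One-shot read; format="sas7bdat" is inferred from the extension.
    df = pd.read_sas("example.sas7bdat", encoding="latin-1")

    # Chunked read; read_sas hands back the SAS7BDATReader directly.
    reader = pd.read_sas("example.sas7bdat", chunksize=10000, encoding="latin-1")
    for chunk in reader:
        print(chunk.shape)
    reader.close()
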
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index b1b92fc2b8439..983ac1c818c42 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -7,9 +7,11 @@
from pandas.core.api import DataFrame
-def read_spss(path: Union[str, Path],
- usecols: Optional[Sequence[str]] = None,
- convert_categoricals: bool = True) -> DataFrame:
+def read_spss(
+ path: Union[str, Path],
+ usecols: Optional[Sequence[str]] = None,
+ convert_categoricals: bool = True,
+) -> DataFrame:
"""
Load an SPSS file from the file path, returning a DataFrame.
@@ -36,6 +38,7 @@ def read_spss(path: Union[str, Path],
else:
usecols = list(usecols) # pyreadstat requires a list
- df, _ = pyreadstat.read_sav(path, usecols=usecols,
- apply_value_formats=convert_categoricals)
+ df, _ = pyreadstat.read_sav(
+ path, usecols=usecols, apply_value_formats=convert_categoricals
+ )
return df
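
A minimal usage sketch for the wrapper above (file and column names are placeholders; pyreadstat is required):

    import pandas as pd

    # Read two columns and keep the raw coded values rather than applying
    # SPSS value labels (convert_categoricals=False skips the labelling).
    df = pd.read_spss("survey.sav", usecols=["age", "income"],
                      convert_categoricals=False)
    print(df.dtypes)
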
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 6cb57077be76a..211571c7dbaa1 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -14,8 +14,7 @@
import pandas._libs.lib as lib
from pandas.compat import raise_with_traceback
-from pandas.core.dtypes.common import (
- is_datetime64tz_dtype, is_dict_like, is_list_like)
+from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
@@ -43,12 +42,14 @@ def _is_sqlalchemy_connectable(con):
if _SQLALCHEMY_INSTALLED is None:
try:
import sqlalchemy
+
_SQLALCHEMY_INSTALLED = True
except ImportError:
_SQLALCHEMY_INSTALLED = False
if _SQLALCHEMY_INSTALLED:
import sqlalchemy # noqa: F811
+
return isinstance(con, sqlalchemy.engine.Connectable)
else:
return False
@@ -58,7 +59,7 @@ def _convert_params(sql, params):
"""Convert SQL and params args to DBAPI2.0 compliant format."""
args = [sql]
if params is not None:
- if hasattr(params, 'keys'): # test if params is a mapping
+ if hasattr(params, "keys"): # test if params is a mapping
args += [params]
else:
args += [list(params)]
@@ -71,28 +72,30 @@ def _process_parse_dates_argument(parse_dates):
if parse_dates is True or parse_dates is None or parse_dates is False:
parse_dates = []
- elif not hasattr(parse_dates, '__iter__'):
+ elif not hasattr(parse_dates, "__iter__"):
parse_dates = [parse_dates]
return parse_dates
def _handle_date_column(col, utc=None, format=None):
if isinstance(format, dict):
- return to_datetime(col, errors='ignore', **format)
+ return to_datetime(col, errors="ignore", **format)
else:
# Allow passing of formatting string for integers
# GH17855
- if format is None and (issubclass(col.dtype.type, np.floating) or
- issubclass(col.dtype.type, np.integer)):
- format = 's'
- if format in ['D', 'd', 'h', 'm', 's', 'ms', 'us', 'ns']:
- return to_datetime(col, errors='coerce', unit=format, utc=utc)
+ if format is None and (
+ issubclass(col.dtype.type, np.floating)
+ or issubclass(col.dtype.type, np.integer)
+ ):
+ format = "s"
+ if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]:
+ return to_datetime(col, errors="coerce", unit=format, utc=utc)
elif is_datetime64tz_dtype(col):
# coerce to UTC timezone
# GH11216
return to_datetime(col, utc=True)
else:
- return to_datetime(col, errors='coerce', format=format, utc=utc)
+ return to_datetime(col, errors="coerce", format=format, utc=utc)
def _parse_date_columns(data_frame, parse_dates):
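
For the fast path above, where integer and float columns with no explicit format default to unit="s", a small sketch of the resulting conversion (the epoch seconds are arbitrary):

    import pandas as pd

    # POSIX-second columns parse via unit="s"; errors="coerce" turns
    # unparseable values into NaT, and utc=True localizes the result.
    col = pd.Series([0, 1562112000, None])
    print(pd.to_datetime(col, errors="coerce", unit="s", utc=True))
    # -> 1970-01-01, 2019-07-03, NaT (all UTC)
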
@@ -116,12 +119,10 @@ def _parse_date_columns(data_frame, parse_dates):
return data_frame
-def _wrap_result(data, columns, index_col=None, coerce_float=True,
- parse_dates=None):
+def _wrap_result(data, columns, index_col=None, coerce_float=True, parse_dates=None):
"""Wrap result set of query in a DataFrame."""
- frame = DataFrame.from_records(data, columns=columns,
- coerce_float=coerce_float)
+ frame = DataFrame.from_records(data, columns=columns, coerce_float=coerce_float)
frame = _parse_date_columns(frame, parse_dates)
@@ -162,9 +163,17 @@ def execute(sql, con, cur=None, params=None):
# -----------------------------------------------------------------------------
# -- Read and write to DataFrames
-def read_sql_table(table_name, con, schema=None, index_col=None,
- coerce_float=True, parse_dates=None, columns=None,
- chunksize=None):
+
+def read_sql_table(
+ table_name,
+ con,
+ schema=None,
+ index_col=None,
+ coerce_float=True,
+ parse_dates=None,
+ columns=None,
+ chunksize=None,
+):
"""
Read SQL database table into a DataFrame.
@@ -223,10 +232,12 @@ def read_sql_table(table_name, con, schema=None, index_col=None,
con = _engine_builder(con)
if not _is_sqlalchemy_connectable(con):
- raise NotImplementedError("read_sql_table only supported for "
- "SQLAlchemy connectable.")
+ raise NotImplementedError(
+ "read_sql_table only supported for " "SQLAlchemy connectable."
+ )
import sqlalchemy
from sqlalchemy.schema import MetaData
+
meta = MetaData(con, schema=schema)
try:
meta.reflect(only=[table_name], views=True)
@@ -235,8 +246,13 @@ def read_sql_table(table_name, con, schema=None, index_col=None,
pandas_sql = SQLDatabase(con, meta=meta)
table = pandas_sql.read_table(
- table_name, index_col=index_col, coerce_float=coerce_float,
- parse_dates=parse_dates, columns=columns, chunksize=chunksize)
+ table_name,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ columns=columns,
+ chunksize=chunksize,
+ )
if table is not None:
return table
@@ -244,8 +260,15 @@ def read_sql_table(table_name, con, schema=None, index_col=None,
raise ValueError("Table {name} not found".format(name=table_name), con)
-def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,
- parse_dates=None, chunksize=None):
+def read_sql_query(
+ sql,
+ con,
+ index_col=None,
+ coerce_float=True,
+ params=None,
+ parse_dates=None,
+ chunksize=None,
+):
"""Read SQL query into a DataFrame.
Returns a DataFrame corresponding to the result set of the query
@@ -301,12 +324,25 @@ def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,
"""
pandas_sql = pandasSQL_builder(con)
return pandas_sql.read_query(
- sql, index_col=index_col, params=params, coerce_float=coerce_float,
- parse_dates=parse_dates, chunksize=chunksize)
-
-
-def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
- parse_dates=None, columns=None, chunksize=None):
+ sql,
+ index_col=index_col,
+ params=params,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ chunksize=chunksize,
+ )
+
+
+def read_sql(
+ sql,
+ con,
+ index_col=None,
+ coerce_float=True,
+ params=None,
+ parse_dates=None,
+ columns=None,
+ chunksize=None,
+):
"""
Read SQL query or database table into a DataFrame.
@@ -366,9 +402,13 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
if isinstance(pandas_sql, SQLiteDatabase):
return pandas_sql.read_query(
- sql, index_col=index_col, params=params,
- coerce_float=coerce_float, parse_dates=parse_dates,
- chunksize=chunksize)
+ sql,
+ index_col=index_col,
+ params=params,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ chunksize=chunksize,
+ )
try:
_is_table_name = pandas_sql.has_table(sql)
@@ -379,17 +419,36 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
if _is_table_name:
pandas_sql.meta.reflect(only=[sql])
return pandas_sql.read_table(
- sql, index_col=index_col, coerce_float=coerce_float,
- parse_dates=parse_dates, columns=columns, chunksize=chunksize)
+ sql,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ columns=columns,
+ chunksize=chunksize,
+ )
else:
return pandas_sql.read_query(
- sql, index_col=index_col, params=params,
- coerce_float=coerce_float, parse_dates=parse_dates,
- chunksize=chunksize)
+ sql,
+ index_col=index_col,
+ params=params,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ chunksize=chunksize,
+ )
-def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
- index_label=None, chunksize=None, dtype=None, method=None):
+def to_sql(
+ frame,
+ name,
+ con,
+ schema=None,
+ if_exists="fail",
+ index=True,
+ index_label=None,
+ chunksize=None,
+ dtype=None,
+ method=None,
+):
"""
Write records stored in a DataFrame to a SQL database.
@@ -435,7 +494,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
.. versionadded:: 0.24.0
"""
- if if_exists not in ('fail', 'replace', 'append'):
+ if if_exists not in ("fail", "replace", "append"):
raise ValueError("'{0}' is not valid for if_exists".format(if_exists))
pandas_sql = pandasSQL_builder(con, schema=schema)
@@ -443,12 +502,21 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
if isinstance(frame, Series):
frame = frame.to_frame()
elif not isinstance(frame, DataFrame):
- raise NotImplementedError("'frame' argument should be either a "
- "Series or a DataFrame")
+ raise NotImplementedError(
+ "'frame' argument should be either a " "Series or a DataFrame"
+ )
- pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
- index_label=index_label, schema=schema,
- chunksize=chunksize, dtype=dtype, method=method)
+ pandas_sql.to_sql(
+ frame,
+ name,
+ if_exists=if_exists,
+ index=index,
+ index_label=index_label,
+ schema=schema,
+ chunksize=chunksize,
+ dtype=dtype,
+ method=method,
+ )
def has_table(table_name, con, schema=None):
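
An end-to-end sketch of the to_sql wrapper reformatted just above, using the standard-library sqlite3 driver so it runs without SQLAlchemy (table name and frame contents are invented):

    import sqlite3
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

    con = sqlite3.connect(":memory:")
    # A raw DBAPI connection takes the SQLite fallback branch of
    # pandasSQL_builder; if_exists="append" reuses an existing table.
    df.to_sql("demo", con, if_exists="append", index=False)
    print(pd.read_sql_query("SELECT COUNT(*) AS n FROM demo", con))
    con.close()
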
@@ -496,8 +564,7 @@ def _engine_builder(con):
return con
-def pandasSQL_builder(con, schema=None, meta=None,
- is_cursor=False):
+def pandasSQL_builder(con, schema=None, meta=None, is_cursor=False):
"""
Convenience function to return the correct PandasSQL subclass based on the
provided parameters.
@@ -521,11 +588,22 @@ class SQLTable(PandasObject):
Also holds various flags needed to avoid having to
pass them between functions all the time.
"""
+
# TODO: support for multiIndex
- def __init__(self, name, pandas_sql_engine, frame=None, index=True,
- if_exists='fail', prefix='pandas', index_label=None,
- schema=None, keys=None, dtype=None):
+ def __init__(
+ self,
+ name,
+ pandas_sql_engine,
+ frame=None,
+ index=True,
+ if_exists="fail",
+ prefix="pandas",
+ index_label=None,
+ schema=None,
+ keys=None,
+ dtype=None,
+ ):
self.name = name
self.pd_sql = pandas_sql_engine
self.prefix = prefix
@@ -544,14 +622,14 @@ def __init__(self, name, pandas_sql_engine, frame=None, index=True,
self.table = self.pd_sql.get_table(self.name, self.schema)
if self.table is None:
- raise ValueError(
- "Could not init table '{name}'".format(name=name))
+ raise ValueError("Could not init table '{name}'".format(name=name))
def exists(self):
return self.pd_sql.has_table(self.name, self.schema)
def sql_schema(self):
from sqlalchemy.schema import CreateTable
+
return str(CreateTable(self.table).compile(self.pd_sql.connectable))
def _execute_create(self):
@@ -561,17 +639,19 @@ def _execute_create(self):
def create(self):
if self.exists():
- if self.if_exists == 'fail':
+ if self.if_exists == "fail":
raise ValueError(
- "Table '{name}' already exists.".format(name=self.name))
- elif self.if_exists == 'replace':
+ "Table '{name}' already exists.".format(name=self.name)
+ )
+ elif self.if_exists == "replace":
self.pd_sql.drop_table(self.name, self.schema)
self._execute_create()
- elif self.if_exists == 'append':
+ elif self.if_exists == "append":
pass
else:
raise ValueError(
- "'{0}' is not valid for if_exists".format(self.if_exists))
+ "'{0}' is not valid for if_exists".format(self.if_exists)
+ )
else:
self._execute_create()
@@ -606,8 +686,7 @@ def insert_data(self):
try:
temp.reset_index(inplace=True)
except ValueError as err:
- raise ValueError(
- "duplicate name in index/columns: {0}".format(err))
+ raise ValueError("duplicate name in index/columns: {0}".format(err))
else:
temp = self.frame
@@ -626,7 +705,7 @@ def insert_data(self):
d = np.atleast_2d(d)
else:
# convert to microsecond resolution for datetime.datetime
- d = b.values.astype('M8[us]').astype(object)
+ d = b.values.astype("M8[us]").astype(object)
else:
d = np.array(b.get_values(), dtype=object)
@@ -645,12 +724,12 @@ def insert(self, chunksize=None, method=None):
# set insert method
if method is None:
exec_insert = self._execute_insert
- elif method == 'multi':
+ elif method == "multi":
exec_insert = self._execute_insert_multi
elif callable(method):
exec_insert = partial(method, self)
else:
- raise ValueError('Invalid parameter `method`: {}'.format(method))
+ raise ValueError("Invalid parameter `method`: {}".format(method))
keys, data_list = self.insert_data()
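# Illustrative sketch (not from the patch) of a user-supplied `method` callable
# accepted by the dispatch above: exec_insert is partial(method, self) and is
# invoked as exec_insert(conn, keys, chunk_iter), so the callable receives
# (pd_table, conn, keys, data_iter).  This variant assumes the sqlite fallback
# (qmark placeholders, a cursor-like conn) and is not the shipped implementation.
def insert_via_executemany(pd_table, conn, keys, data_iter):
    placeholders = ",".join(["?"] * len(keys))
    columns = ",".join('"' + k + '"' for k in keys)
    sql = 'INSERT INTO "{}" ({}) VALUES ({})'.format(
        pd_table.name, columns, placeholders
    )
    conn.executemany(sql, list(data_iter))

# hypothetical call site: df.to_sql("demo", con, method=insert_via_executemany)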
@@ -662,7 +741,7 @@ def insert(self, chunksize=None, method=None):
if chunksize is None:
chunksize = nrows
elif chunksize == 0:
- raise ValueError('chunksize argument should be non-zero')
+ raise ValueError("chunksize argument should be non-zero")
chunks = int(nrows / chunksize) + 1
@@ -676,8 +755,9 @@ def insert(self, chunksize=None, method=None):
chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
exec_insert(conn, keys, chunk_iter)
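# Worked example (editorial, not from the patch) of the chunking arithmetic used
# above: with nrows = 10 and chunksize = 4, chunks = int(10 / 4) + 1 = 3 and the
# row slices handed to exec_insert are [0:4], [4:8] and [8:10]; an empty
# trailing slice is simply skipped.
nrows, chunksize = 10, 4
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
    start_i = i * chunksize
    end_i = min(start_i + chunksize, nrows)
    if start_i >= end_i:
        break
    print(start_i, end_i)  # 0 4, then 4 8, then 8 10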
- def _query_iterator(self, result, chunksize, columns, coerce_float=True,
- parse_dates=None):
+ def _query_iterator(
+ self, result, chunksize, columns, coerce_float=True, parse_dates=None
+ ):
"""Return generator through chunked result set."""
while True:
@@ -686,7 +766,8 @@ def _query_iterator(self, result, chunksize, columns, coerce_float=True,
break
else:
self.frame = DataFrame.from_records(
- data, columns=columns, coerce_float=coerce_float)
+ data, columns=columns, coerce_float=coerce_float
+ )
self._harmonize_columns(parse_dates=parse_dates)
@@ -695,11 +776,11 @@ def _query_iterator(self, result, chunksize, columns, coerce_float=True,
yield self.frame
- def read(self, coerce_float=True, parse_dates=None, columns=None,
- chunksize=None):
+ def read(self, coerce_float=True, parse_dates=None, columns=None, chunksize=None):
if columns is not None and len(columns) > 0:
from sqlalchemy import select
+
cols = [self.table.c[n] for n in columns]
if self.index is not None:
[cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]]
@@ -711,13 +792,18 @@ def read(self, coerce_float=True, parse_dates=None, columns=None,
column_names = result.keys()
if chunksize is not None:
- return self._query_iterator(result, chunksize, column_names,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ return self._query_iterator(
+ result,
+ chunksize,
+ column_names,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
else:
data = result.fetchall()
self.frame = DataFrame.from_records(
- data, columns=column_names, coerce_float=coerce_float)
+ data, columns=column_names, coerce_float=coerce_float
+ )
self._harmonize_columns(parse_dates=parse_dates)
@@ -737,16 +823,22 @@ def _index_name(self, index, index_label):
if len(index_label) != nlevels:
raise ValueError(
"Length of 'index_label' should match number of "
- "levels, which is {0}".format(nlevels))
+ "levels, which is {0}".format(nlevels)
+ )
else:
return index_label
# return the used column labels for the index columns
- if (nlevels == 1 and 'index' not in self.frame.columns and
- self.frame.index.name is None):
- return ['index']
+ if (
+ nlevels == 1
+ and "index" not in self.frame.columns
+ and self.frame.index.name is None
+ ):
+ return ["index"]
else:
- return [l if l is not None else "level_{0}".format(i)
- for i, l in enumerate(self.frame.index.names)]
+ return [
+ l if l is not None else "level_{0}".format(i)
+ for i, l in enumerate(self.frame.index.names)
+ ]
# for reading: index=(list of) string to specify column to set as index
elif isinstance(index, str):
@@ -760,14 +852,11 @@ def _get_column_names_and_types(self, dtype_mapper):
column_names_and_types = []
if self.index is not None:
for i, idx_label in enumerate(self.index):
- idx_type = dtype_mapper(
- self.frame.index._get_level_values(i))
+ idx_type = dtype_mapper(self.frame.index._get_level_values(i))
column_names_and_types.append((str(idx_label), idx_type, True))
column_names_and_types += [
- (str(self.frame.columns[i]),
- dtype_mapper(self.frame.iloc[:, i]),
- False)
+ (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False)
for i in range(len(self.frame.columns))
]
@@ -776,19 +865,19 @@ def _get_column_names_and_types(self, dtype_mapper):
def _create_table_setup(self):
from sqlalchemy import Table, Column, PrimaryKeyConstraint
- column_names_and_types = self._get_column_names_and_types(
- self._sqlalchemy_type
- )
+ column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type)
- columns = [Column(name, typ, index=is_index)
- for name, typ, is_index in column_names_and_types]
+ columns = [
+ Column(name, typ, index=is_index)
+ for name, typ, is_index in column_names_and_types
+ ]
if self.keys is not None:
if not is_list_like(self.keys):
keys = [self.keys]
else:
keys = self.keys
- pkc = PrimaryKeyConstraint(*keys, name=self.name + '_pk')
+ pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk")
columns.append(pkc)
schema = self.schema or self.pd_sql.meta.schema
@@ -796,6 +885,7 @@ def _create_table_setup(self):
# At this point, attach to new metadata, only attach to self.meta
# once table is created.
from sqlalchemy.schema import MetaData
+
meta = MetaData(self.pd_sql, schema=schema)
return Table(self.name, meta, *columns, schema=schema)
@@ -826,15 +916,17 @@ def _harmonize_columns(self, parse_dates=None):
fmt = parse_dates[col_name]
except TypeError:
fmt = None
- self.frame[col_name] = _handle_date_column(
- df_col, format=fmt)
+ self.frame[col_name] = _handle_date_column(df_col, format=fmt)
continue
# the type the dataframe column should have
col_type = self._get_dtype(sql_col.type)
- if (col_type is datetime or col_type is date or
- col_type is DatetimeTZDtype):
+ if (
+ col_type is datetime
+ or col_type is date
+ or col_type is DatetimeTZDtype
+ ):
# Convert tz-aware Datetime SQL columns to UTC
utc = col_type is DatetimeTZDtype
self.frame[col_name] = _handle_date_column(df_col, utc=utc)
@@ -844,9 +936,8 @@ def _harmonize_columns(self, parse_dates=None):
elif len(df_col) == df_col.count():
# No NA values, can convert ints and bools
- if col_type is np.dtype('int64') or col_type is bool:
- self.frame[col_name] = df_col.astype(
- col_type, copy=False)
+ if col_type is np.dtype("int64") or col_type is bool:
+ self.frame[col_name] = df_col.astype(col_type, copy=False)
except KeyError:
pass # this column not in results
@@ -860,11 +951,19 @@ def _sqlalchemy_type(self, col):
# Needed for inserting typed data containing NULLs, GH 8778.
col_type = lib.infer_dtype(col, skipna=True)
- from sqlalchemy.types import (BigInteger, Integer, Float,
- Text, Boolean,
- DateTime, Date, Time, TIMESTAMP)
+ from sqlalchemy.types import (
+ BigInteger,
+ Integer,
+ Float,
+ Text,
+ Boolean,
+ DateTime,
+ Date,
+ Time,
+ TIMESTAMP,
+ )
- if col_type == 'datetime64' or col_type == 'datetime':
+ if col_type == "datetime64" or col_type == "datetime":
# GH 9086: TIMESTAMP is the suggested type if the column contains
# timezone information
try:
@@ -875,41 +974,44 @@ def _sqlalchemy_type(self, col):
if col.tz is not None:
return TIMESTAMP(timezone=True)
return DateTime
- if col_type == 'timedelta64':
- warnings.warn("the 'timedelta' type is not supported, and will be "
- "written as integer values (ns frequency) to the "
- "database.", UserWarning, stacklevel=8)
+ if col_type == "timedelta64":
+ warnings.warn(
+ "the 'timedelta' type is not supported, and will be "
+ "written as integer values (ns frequency) to the "
+ "database.",
+ UserWarning,
+ stacklevel=8,
+ )
return BigInteger
- elif col_type == 'floating':
- if col.dtype == 'float32':
+ elif col_type == "floating":
+ if col.dtype == "float32":
return Float(precision=23)
else:
return Float(precision=53)
- elif col_type == 'integer':
- if col.dtype == 'int32':
+ elif col_type == "integer":
+ if col.dtype == "int32":
return Integer
else:
return BigInteger
- elif col_type == 'boolean':
+ elif col_type == "boolean":
return Boolean
- elif col_type == 'date':
+ elif col_type == "date":
return Date
- elif col_type == 'time':
+ elif col_type == "time":
return Time
- elif col_type == 'complex':
- raise ValueError('Complex datatypes not supported')
+ elif col_type == "complex":
+ raise ValueError("Complex datatypes not supported")
return Text
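# Illustrative sketch (not from the patch): the SQLAlchemy types inferred by
# _sqlalchemy_type above can be overridden per column through the `dtype`
# argument, which SQLDatabase.to_sql validates against sqlalchemy TypeEngine
# instances.  The engine URL, table and column names here are hypothetical.
import pandas as pd
from sqlalchemy import create_engine, types

engine = create_engine("sqlite:///:memory:")
df = pd.DataFrame({"price": [1.5, 2.25], "label": ["a", "b"]})
df.to_sql(
    "items",
    engine,
    index=False,
    dtype={"price": types.Float(precision=53), "label": types.Text()},
)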
def _get_dtype(self, sqltype):
- from sqlalchemy.types import (Integer, Float, Boolean, DateTime,
- Date, TIMESTAMP)
+ from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP
if isinstance(sqltype, Float):
return float
elif isinstance(sqltype, Integer):
# TODO: Refine integer size.
- return np.dtype('int64')
+ return np.dtype("int64")
elif isinstance(sqltype, TIMESTAMP):
# we have a timezone capable type
if not sqltype.timezone:
@@ -931,12 +1033,16 @@ class PandasSQL(PandasObject):
"""
def read_sql(self, *args, **kwargs):
- raise ValueError("PandasSQL must be created with an SQLAlchemy "
- "connectable or sqlite connection")
+ raise ValueError(
+ "PandasSQL must be created with an SQLAlchemy "
+ "connectable or sqlite connection"
+ )
def to_sql(self, *args, **kwargs):
- raise ValueError("PandasSQL must be created with an SQLAlchemy "
- "connectable or sqlite connection")
+ raise ValueError(
+ "PandasSQL must be created with an SQLAlchemy "
+ "connectable or sqlite connection"
+ )
class SQLDatabase(PandasSQL):
@@ -963,6 +1069,7 @@ def __init__(self, engine, schema=None, meta=None):
self.connectable = engine
if not meta:
from sqlalchemy.schema import MetaData
+
meta = MetaData(self.connectable, schema=schema)
self.meta = meta
@@ -970,7 +1077,7 @@ def __init__(self, engine, schema=None, meta=None):
@contextmanager
def run_transaction(self):
with self.connectable.begin() as tx:
- if hasattr(tx, 'execute'):
+ if hasattr(tx, "execute"):
yield tx
else:
yield self.connectable
@@ -979,9 +1086,16 @@ def execute(self, *args, **kwargs):
"""Simple passthrough to SQLAlchemy connectable"""
return self.connectable.execute(*args, **kwargs)
- def read_table(self, table_name, index_col=None, coerce_float=True,
- parse_dates=None, columns=None, schema=None,
- chunksize=None):
+ def read_table(
+ self,
+ table_name,
+ index_col=None,
+ coerce_float=True,
+ parse_dates=None,
+ columns=None,
+ schema=None,
+ chunksize=None,
+ ):
"""Read SQL database table into a DataFrame.
Parameters
@@ -1024,13 +1138,17 @@ def read_table(self, table_name, index_col=None, coerce_float=True,
"""
table = SQLTable(table_name, self, index=index_col, schema=schema)
- return table.read(coerce_float=coerce_float,
- parse_dates=parse_dates, columns=columns,
- chunksize=chunksize)
+ return table.read(
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ columns=columns,
+ chunksize=chunksize,
+ )
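# Illustrative sketch (not from the patch): pd.read_sql_table is the public
# entry point that ends up in SQLDatabase.read_table above, which reflects the
# table and calls SQLTable.read.  It requires an SQLAlchemy connectable; the
# database URL, table and column names are made up and assume the table exists.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("sqlite:///demo.db")
df = pd.read_sql_table(
    "items",
    engine,
    index_col="id",
    columns=["price", "label"],
    parse_dates=["created_at"],
)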
@staticmethod
- def _query_iterator(result, chunksize, columns, index_col=None,
- coerce_float=True, parse_dates=None):
+ def _query_iterator(
+ result, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None
+ ):
"""Return generator through chunked result set"""
while True:
@@ -1038,12 +1156,23 @@ def _query_iterator(result, chunksize, columns, index_col=None,
if not data:
break
else:
- yield _wrap_result(data, columns, index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ yield _wrap_result(
+ data,
+ columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
- def read_query(self, sql, index_col=None, coerce_float=True,
- parse_dates=None, params=None, chunksize=None):
+ def read_query(
+ self,
+ sql,
+ index_col=None,
+ coerce_float=True,
+ parse_dates=None,
+ params=None,
+ chunksize=None,
+ ):
"""Read SQL query into a DataFrame.
Parameters
@@ -1090,22 +1219,39 @@ def read_query(self, sql, index_col=None, coerce_float=True,
columns = result.keys()
if chunksize is not None:
- return self._query_iterator(result, chunksize, columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ return self._query_iterator(
+ result,
+ chunksize,
+ columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
else:
data = result.fetchall()
- frame = _wrap_result(data, columns, index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ frame = _wrap_result(
+ data,
+ columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
return frame
read_sql = read_query
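# Illustrative sketch (not from the patch) of the chunksize branch above: when
# chunksize is given, read_query (aliased to read_sql here) returns the
# generator built by _query_iterator instead of a single DataFrame.  The same
# keyword works through the top-level pd.read_sql, shown with a throwaway
# sqlite3 database.
import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
pd.DataFrame({"a": range(10)}).to_sql("demo", con, index=False)
for chunk in pd.read_sql("SELECT a FROM demo", con, chunksize=4):
    print(len(chunk))  # 4, 4, 2
con.close()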
- def to_sql(self, frame, name, if_exists='fail', index=True,
- index_label=None, schema=None, chunksize=None, dtype=None,
- method=None):
+ def to_sql(
+ self,
+ frame,
+ name,
+ if_exists="fail",
+ index=True,
+ index_label=None,
+ schema=None,
+ chunksize=None,
+ dtype=None,
+ method=None,
+ ):
"""
Write records stored in a DataFrame to a SQL database.
@@ -1152,24 +1298,33 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
if dtype is not None:
from sqlalchemy.types import to_instance, TypeEngine
+
for col, my_type in dtype.items():
if not isinstance(to_instance(my_type), TypeEngine):
- raise ValueError('The type of {column} is not a '
- 'SQLAlchemy type '.format(column=col))
+ raise ValueError(
+ "The type of {column} is not a "
+ "SQLAlchemy type ".format(column=col)
+ )
- table = SQLTable(name, self, frame=frame, index=index,
- if_exists=if_exists, index_label=index_label,
- schema=schema, dtype=dtype)
+ table = SQLTable(
+ name,
+ self,
+ frame=frame,
+ index=index,
+ if_exists=if_exists,
+ index_label=index_label,
+ schema=schema,
+ dtype=dtype,
+ )
table.create()
table.insert(chunksize, method=method)
- if (not name.isdigit() and not name.islower()):
+ if not name.isdigit() and not name.islower():
# check for potentially case sensitivity issues (GH7815)
# Only check when name is not a number and name is not lower case
engine = self.connectable.engine
with self.connectable.connect() as conn:
table_names = engine.table_names(
- schema=schema or self.meta.schema,
- connection=conn,
+ schema=schema or self.meta.schema, connection=conn
)
if name not in table_names:
msg = (
@@ -1186,20 +1341,19 @@ def tables(self):
def has_table(self, name, schema=None):
return self.connectable.run_callable(
- self.connectable.dialect.has_table,
- name,
- schema or self.meta.schema,
+ self.connectable.dialect.has_table, name, schema or self.meta.schema
)
def get_table(self, table_name, schema=None):
schema = schema or self.meta.schema
if schema:
- tbl = self.meta.tables.get('.'.join([schema, table_name]))
+ tbl = self.meta.tables.get(".".join([schema, table_name]))
else:
tbl = self.meta.tables.get(table_name)
# Avoid casting double-precision floats into decimals
from sqlalchemy import Numeric
+
for column in tbl.columns:
if isinstance(column.type, Numeric):
column.type.asdecimal = False
@@ -1214,8 +1368,9 @@ def drop_table(self, table_name, schema=None):
self.meta.clear()
def _create_sql_schema(self, frame, table_name, keys=None, dtype=None):
- table = SQLTable(table_name, self, frame=frame, index=False, keys=keys,
- dtype=dtype)
+ table = SQLTable(
+ table_name, self, frame=frame, index=False, keys=keys, dtype=dtype
+ )
return str(table.sql_schema())
@@ -1223,13 +1378,13 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None):
# sqlite-specific sql strings and handler class
# dictionary used for readability purposes
_SQL_TYPES = {
- 'string': 'TEXT',
- 'floating': 'REAL',
- 'integer': 'INTEGER',
- 'datetime': 'TIMESTAMP',
- 'date': 'DATE',
- 'time': 'TIME',
- 'boolean': 'INTEGER',
+ "string": "TEXT",
+ "floating": "REAL",
+ "integer": "INTEGER",
+ "datetime": "TIMESTAMP",
+ "date": "DATE",
+ "time": "TIME",
+ "boolean": "INTEGER",
}
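# Illustrative sketch (not from the patch): the fallback type names above
# surface in the CREATE TABLE text produced by get_schema, which routes through
# _create_sql_schema.  The frame and table name are made up and the printed
# output is approximate.
import pandas as pd
from pandas.io.sql import get_schema

df = pd.DataFrame({"a": [1], "b": [1.5], "c": ["x"]})
print(get_schema(df, "demo"))
# roughly: CREATE TABLE "demo" ("a" INTEGER, "b" REAL, "c" TEXT)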
@@ -1238,7 +1393,8 @@ def _get_unicode_name(name):
uname = str(name).encode("utf-8", "strict").decode("utf-8")
except UnicodeError:
raise ValueError(
- "Cannot convert identifier to UTF-8: '{name}'".format(name=name))
+ "Cannot convert identifier to UTF-8: '{name}'".format(name=name)
+ )
return uname
@@ -1256,13 +1412,15 @@ def _get_valid_sqlite_name(name):
nul_index = uname.find("\x00")
if nul_index >= 0:
- raise ValueError('SQLite identifier cannot contain NULs')
+ raise ValueError("SQLite identifier cannot contain NULs")
return '"' + uname.replace('"', '""') + '"'
-_SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. "
- "In pandas versions < 0.14, spaces were converted to "
- "underscores.")
+_SAFE_NAMES_WARNING = (
+ "The spaces in these column names will not be changed. "
+ "In pandas versions < 0.14, spaces were converted to "
+ "underscores."
+)
class SQLiteTable(SQLTable):
@@ -1275,6 +1433,7 @@ def __init__(self, *args, **kwargs):
# GH 8341
# register an adapter callable for datetime.time object
import sqlite3
+
# this will transform time(12,34,56,789) into '12:34:56.000789'
# (this is what sqlalchemy does)
sqlite3.register_adapter(time, lambda _: _.strftime("%H:%M:%S.%f"))
@@ -1290,18 +1449,18 @@ def _execute_create(self):
def insert_statement(self):
names = list(map(str, self.frame.columns))
- wld = '?' # wildcard char
+ wld = "?" # wildcard char
escape = _get_valid_sqlite_name
if self.index is not None:
[names.insert(0, idx) for idx in self.index[::-1]]
bracketed_names = [escape(column) for column in names]
- col_names = ','.join(bracketed_names)
- wildcards = ','.join([wld] * len(names))
- insert_statement = \
- 'INSERT INTO {table} ({columns}) VALUES ({wld})'.format(
- table=escape(self.name), columns=col_names, wld=wildcards)
+ col_names = ",".join(bracketed_names)
+ wildcards = ",".join([wld] * len(names))
+ insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format(
+ table=escape(self.name), columns=col_names, wld=wildcards
+ )
return insert_statement
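# Worked example (editorial, not from the patch) of the statement assembled
# above for a frame with index label "id" and columns "a" and "b"; the names
# are illustrative.
def escape(n):
    return '"' + n.replace('"', '""') + '"'

names = ["id", "a", "b"]
col_names = ",".join(escape(c) for c in names)
wildcards = ",".join(["?"] * len(names))
print(
    "INSERT INTO {table} ({columns}) VALUES ({wld})".format(
        table=escape("demo"), columns=col_names, wld=wildcards
    )
)
# INSERT INTO "demo" ("id","a","b") VALUES (?,?,?)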
def _execute_insert(self, conn, keys, data_iter):
@@ -1314,19 +1473,18 @@ def _create_table_setup(self):
structure of a DataFrame. The first entry will be a CREATE TABLE
statement while the rest will be CREATE INDEX statements.
"""
- column_names_and_types = self._get_column_names_and_types(
- self._sql_type_name
- )
+ column_names_and_types = self._get_column_names_and_types(self._sql_type_name)
- pat = re.compile(r'\s+')
+ pat = re.compile(r"\s+")
column_names = [col_name for col_name, _, _ in column_names_and_types]
if any(map(pat.search, column_names)):
warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6)
escape = _get_valid_sqlite_name
- create_tbl_stmts = [escape(cname) + ' ' + ctype
- for cname, ctype, _ in column_names_and_types]
+ create_tbl_stmts = [
+ escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types
+ ]
if self.keys is not None and len(self.keys):
if not is_list_like(self.keys):
@@ -1336,19 +1494,31 @@ def _create_table_setup(self):
cnames_br = ", ".join(escape(c) for c in keys)
create_tbl_stmts.append(
"CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format(
- tbl=self.name, cnames_br=cnames_br))
-
- create_stmts = ["CREATE TABLE " + escape(self.name) + " (\n" +
- ',\n '.join(create_tbl_stmts) + "\n)"]
+ tbl=self.name, cnames_br=cnames_br
+ )
+ )
+
+ create_stmts = [
+ "CREATE TABLE "
+ + escape(self.name)
+ + " (\n"
+ + ",\n ".join(create_tbl_stmts)
+ + "\n)"
+ ]
- ix_cols = [cname for cname, _, is_index in column_names_and_types
- if is_index]
+ ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
if len(ix_cols):
cnames = "_".join(ix_cols)
cnames_br = ",".join(escape(c) for c in ix_cols)
create_stmts.append(
- "CREATE INDEX " + escape("ix_" + self.name + "_" + cnames) +
- "ON " + escape(self.name) + " (" + cnames_br + ")")
+ "CREATE INDEX "
+ + escape("ix_" + self.name + "_" + cnames)
+ + "ON "
+ + escape(self.name)
+ + " ("
+ + cnames_br
+ + ")"
+ )
return create_stmts
@@ -1361,10 +1531,14 @@ def _sql_type_name(self, col):
# Needed for inserting typed data containing NULLs, GH 8778.
col_type = lib.infer_dtype(col, skipna=True)
- if col_type == 'timedelta64':
- warnings.warn("the 'timedelta' type is not supported, and will be "
- "written as integer values (ns frequency) to the "
- "database.", UserWarning, stacklevel=8)
+ if col_type == "timedelta64":
+ warnings.warn(
+ "the 'timedelta' type is not supported, and will be "
+ "written as integer values (ns frequency) to the "
+ "database.",
+ UserWarning,
+ stacklevel=8,
+ )
col_type = "integer"
elif col_type == "datetime64":
@@ -1374,7 +1548,7 @@ def _sql_type_name(self, col):
col_type = "string"
elif col_type == "complex":
- raise ValueError('Complex datatypes not supported')
+ raise ValueError("Complex datatypes not supported")
if col_type not in _SQL_TYPES:
col_type = "string"
@@ -1426,17 +1600,19 @@ def execute(self, *args, **kwargs):
except Exception: # pragma: no cover
ex = DatabaseError(
"Execution failed on sql: {sql}\n{exc}\nunable "
- "to rollback".format(sql=args[0], exc=exc))
+ "to rollback".format(sql=args[0], exc=exc)
+ )
raise_with_traceback(ex)
ex = DatabaseError(
- "Execution failed on sql '{sql}': {exc}".format(
- sql=args[0], exc=exc))
+ "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc)
+ )
raise_with_traceback(ex)
@staticmethod
- def _query_iterator(cursor, chunksize, columns, index_col=None,
- coerce_float=True, parse_dates=None):
+ def _query_iterator(
+ cursor, chunksize, columns, index_col=None, coerce_float=True, parse_dates=None
+ ):
"""Return generator through chunked result set"""
while True:
@@ -1447,29 +1623,48 @@ def _query_iterator(cursor, chunksize, columns, index_col=None,
cursor.close()
break
else:
- yield _wrap_result(data, columns, index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ yield _wrap_result(
+ data,
+ columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
- def read_query(self, sql, index_col=None, coerce_float=True, params=None,
- parse_dates=None, chunksize=None):
+ def read_query(
+ self,
+ sql,
+ index_col=None,
+ coerce_float=True,
+ params=None,
+ parse_dates=None,
+ chunksize=None,
+ ):
args = _convert_params(sql, params)
cursor = self.execute(*args)
columns = [col_desc[0] for col_desc in cursor.description]
if chunksize is not None:
- return self._query_iterator(cursor, chunksize, columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ return self._query_iterator(
+ cursor,
+ chunksize,
+ columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
else:
data = self._fetchall_as_list(cursor)
cursor.close()
- frame = _wrap_result(data, columns, index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates)
+ frame = _wrap_result(
+ data,
+ columns,
+ index_col=index_col,
+ coerce_float=coerce_float,
+ parse_dates=parse_dates,
+ )
return frame
def _fetchall_as_list(self, cur):
@@ -1478,9 +1673,18 @@ def _fetchall_as_list(self, cur):
result = list(result)
return result
- def to_sql(self, frame, name, if_exists='fail', index=True,
- index_label=None, schema=None, chunksize=None, dtype=None,
- method=None):
+ def to_sql(
+ self,
+ frame,
+ name,
+ if_exists="fail",
+ index=True,
+ index_label=None,
+ schema=None,
+ chunksize=None,
+ dtype=None,
+ method=None,
+ ):
"""
Write records stored in a DataFrame to a SQL database.
@@ -1527,12 +1731,21 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
if dtype is not None:
for col, my_type in dtype.items():
if not isinstance(my_type, str):
- raise ValueError('{column} ({type!s}) not a string'.format(
- column=col, type=my_type))
+ raise ValueError(
+ "{column} ({type!s}) not a string".format(
+ column=col, type=my_type
+ )
+ )
- table = SQLiteTable(name, self, frame=frame, index=index,
- if_exists=if_exists, index_label=index_label,
- dtype=dtype)
+ table = SQLiteTable(
+ name,
+ self,
+ frame=frame,
+ index=index,
+ if_exists=if_exists,
+ index_label=index_label,
+ dtype=dtype,
+ )
table.create()
table.insert(chunksize, method)
@@ -1541,23 +1754,24 @@ def has_table(self, name, schema=None):
# escape = _get_valid_sqlite_name
# esc_name = escape(name)
- wld = '?'
- query = ("SELECT name FROM sqlite_master "
- "WHERE type='table' AND name={wld};").format(wld=wld)
+ wld = "?"
+ query = (
+ "SELECT name FROM sqlite_master " "WHERE type='table' AND name={wld};"
+ ).format(wld=wld)
- return len(self.execute(query, [name, ]).fetchall()) > 0
+ return len(self.execute(query, [name]).fetchall()) > 0
def get_table(self, table_name, schema=None):
return None # not supported in fallback mode
def drop_table(self, name, schema=None):
- drop_sql = "DROP TABLE {name}".format(
- name=_get_valid_sqlite_name(name))
+ drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name))
self.execute(drop_sql)
def _create_sql_schema(self, frame, table_name, keys=None, dtype=None):
- table = SQLiteTable(table_name, self, frame=frame, index=False,
- keys=keys, dtype=dtype)
+ table = SQLiteTable(
+ table_name, self, frame=frame, index=False, keys=keys, dtype=dtype
+ )
return str(table.sql_schema())
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 00b7a29b27b63..7087d2ee963cb 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -26,20 +26,31 @@
from pandas.util._decorators import Appender, deprecate_kwarg
from pandas.core.dtypes.common import (
- ensure_object, is_categorical_dtype, is_datetime64_dtype)
+ ensure_object,
+ is_categorical_dtype,
+ is_datetime64_dtype,
+)
from pandas import (
- Categorical, DatetimeIndex, NaT, Timestamp, concat, isna, to_datetime,
- to_timedelta)
+ Categorical,
+ DatetimeIndex,
+ NaT,
+ Timestamp,
+ concat,
+ isna,
+ to_datetime,
+ to_timedelta,
+)
from pandas.core.frame import DataFrame
from pandas.core.series import Series
-from pandas.io.common import (
- BaseIterator, _stringify_path, get_filepath_or_buffer)
+from pandas.io.common import BaseIterator, _stringify_path, get_filepath_or_buffer
-_version_error = ("Version of given Stata file is not 104, 105, 108, "
- "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
- "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")
+_version_error = (
+ "Version of given Stata file is not 104, 105, 108, "
+ "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
+ "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
+)
_statafile_processing_params1 = """\
convert_dates : boolean, defaults to True
@@ -111,9 +122,13 @@
>>> itr = pd.read_stata('filename.dta', chunksize=10000)
>>> for chunk in itr:
... do_something(chunk)
-""" % (_statafile_processing_params1, _encoding_params,
- _statafile_processing_params2, _chunksize_params,
- _iterator_params)
+""" % (
+ _statafile_processing_params1,
+ _encoding_params,
+ _statafile_processing_params2,
+ _chunksize_params,
+ _iterator_params,
+)
_data_method_doc = """\
Read observations from Stata file, converting them into a dataframe
@@ -129,7 +144,10 @@
Returns
-------
DataFrame
-""" % (_statafile_processing_params1, _statafile_processing_params2)
+""" % (
+ _statafile_processing_params1,
+ _statafile_processing_params2,
+)
_read_method_doc = """\
Reads observations from Stata file, converting them into a dataframe
@@ -144,7 +162,10 @@
Returns
-------
DataFrame
-""" % (_statafile_processing_params1, _statafile_processing_params2)
+""" % (
+ _statafile_processing_params1,
+ _statafile_processing_params2,
+)
_stata_reader_doc = """\
@@ -161,26 +182,42 @@
%s
%s
%s
-""" % (_statafile_processing_params1, _statafile_processing_params2,
- _encoding_params, _chunksize_params)
+""" % (
+ _statafile_processing_params1,
+ _statafile_processing_params2,
+ _encoding_params,
+ _chunksize_params,
+)
@Appender(_read_stata_doc)
-@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
-@deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
-def read_stata(filepath_or_buffer, convert_dates=True,
- convert_categoricals=True, encoding=None, index_col=None,
- convert_missing=False, preserve_dtypes=True, columns=None,
- order_categoricals=True, chunksize=None, iterator=False):
-
- reader = StataReader(filepath_or_buffer,
- convert_dates=convert_dates,
- convert_categoricals=convert_categoricals,
- index_col=index_col, convert_missing=convert_missing,
- preserve_dtypes=preserve_dtypes,
- columns=columns,
- order_categoricals=order_categoricals,
- chunksize=chunksize)
+@deprecate_kwarg(old_arg_name="encoding", new_arg_name=None)
+@deprecate_kwarg(old_arg_name="index", new_arg_name="index_col")
+def read_stata(
+ filepath_or_buffer,
+ convert_dates=True,
+ convert_categoricals=True,
+ encoding=None,
+ index_col=None,
+ convert_missing=False,
+ preserve_dtypes=True,
+ columns=None,
+ order_categoricals=True,
+ chunksize=None,
+ iterator=False,
+):
+
+ reader = StataReader(
+ filepath_or_buffer,
+ convert_dates=convert_dates,
+ convert_categoricals=convert_categoricals,
+ index_col=index_col,
+ convert_missing=convert_missing,
+ preserve_dtypes=preserve_dtypes,
+ columns=columns,
+ order_categoricals=order_categoricals,
+ chunksize=chunksize,
+ )
if iterator or chunksize:
data = reader
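# Illustrative sketch (not from the patch) of the iterator/chunksize branch
# above: when either is set, read_stata hands back the StataReader itself so
# the .dta file can be streamed in pieces.  The file name is hypothetical and
# process() stands in for user code.
import pandas as pd

reader = pd.read_stata("large_file.dta", chunksize=10000)
for chunk in reader:
    process(chunk)  # placeholder for whatever the caller does per chunk
reader.close()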
@@ -261,12 +298,12 @@ def convert_year_month_safe(year, month):
using datetime.
"""
if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
- return to_datetime(100 * year + month, format='%Y%m')
+ return to_datetime(100 * year + month, format="%Y%m")
else:
- index = getattr(year, 'index', None)
+ index = getattr(year, "index", None)
return Series(
- [datetime.datetime(y, m, 1) for y, m in zip(year, month)],
- index=index)
+ [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index
+ )
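# Quick check (editorial, not from the patch) of the fast path above:
# 100 * year + month packs (2001, 3) into 200103, which to_datetime parses
# with format="%Y%m".
import pandas as pd

year = pd.Series([2001, 2002])
month = pd.Series([3, 12])
print(pd.to_datetime(100 * year + month, format="%Y%m"))
# 2001-03-01 and 2002-12-01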
def convert_year_days_safe(year, days):
"""
@@ -274,12 +311,13 @@ def convert_year_days_safe(year, days):
datetime or datetime64 Series
"""
if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
- return (to_datetime(year, format='%Y') +
- to_timedelta(days, unit='d'))
+ return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
else:
- index = getattr(year, 'index', None)
- value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
- for y, d in zip(year, days)]
+ index = getattr(year, "index", None)
+ value = [
+ datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
+ for y, d in zip(year, days)
+ ]
return Series(value, index=index)
def convert_delta_safe(base, deltas, unit):
@@ -288,18 +326,19 @@ def convert_delta_safe(base, deltas, unit):
versions if the deltas satisfy restrictions required to be expressed
as dates in pandas.
"""
- index = getattr(deltas, 'index', None)
- if unit == 'd':
+ index = getattr(deltas, "index", None)
+ if unit == "d":
if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
values = [base + relativedelta(days=int(d)) for d in deltas]
return Series(values, index=index)
- elif unit == 'ms':
+ elif unit == "ms":
if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
- values = [base + relativedelta(microseconds=(int(d) * 1000))
- for d in deltas]
+ values = [
+ base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas
+ ]
return Series(values, index=index)
else:
- raise ValueError('format not understood')
+ raise ValueError("format not understood")
base = to_datetime(base)
deltas = to_timedelta(deltas, unit=unit)
return base + deltas
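# Sketch (editorial, not from the patch) of the in-range path above, where the
# deltas fit in pandas' datetime64 range and are handled vectorized.
import datetime
import pandas as pd

base = datetime.datetime(1960, 1, 1)   # the Stata epoch
deltas = pd.Series([0, 86400000])      # milliseconds
print(pd.to_datetime(base) + pd.to_timedelta(deltas, unit="ms"))
# 1960-01-01 and 1960-01-02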
@@ -317,11 +356,10 @@ def convert_delta_safe(base, deltas, unit):
if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
base = stata_epoch
ms = dates
- conv_dates = convert_delta_safe(base, ms, 'ms')
+ conv_dates = convert_delta_safe(base, ms, "ms")
elif fmt.startswith(("%tC", "tC")):
- warnings.warn("Encountered %tC format. Leaving in Stata "
- "Internal Format.")
+ warnings.warn("Encountered %tC format. Leaving in Stata " "Internal Format.")
conv_dates = Series(dates, dtype=np.object)
if has_bad_values:
conv_dates[bad_locs] = NaT
@@ -330,7 +368,7 @@ def convert_delta_safe(base, deltas, unit):
elif fmt.startswith(("%td", "td", "%d", "d")):
base = stata_epoch
days = dates
- conv_dates = convert_delta_safe(base, days, 'd')
+ conv_dates = convert_delta_safe(base, days, "d")
# does not count leap days - 7 days is a week.
# 52nd week may have more than 7 days
elif fmt.startswith(("%tw", "tw")):
@@ -383,34 +421,35 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
if is_datetime64_dtype(dates.values):
if delta:
delta = dates - stata_epoch
- d['delta'] = delta.values.astype(
- np.int64) // 1000 # microseconds
+ d["delta"] = delta.values.astype(np.int64) // 1000 # microseconds
if days or year:
dates = DatetimeIndex(dates)
- d['year'], d['month'] = dates.year, dates.month
+ d["year"], d["month"] = dates.year, dates.month
if days:
- days = (dates.astype(np.int64) -
- to_datetime(d['year'], format='%Y').astype(np.int64))
- d['days'] = days // NS_PER_DAY
+ days = dates.astype(np.int64) - to_datetime(
+ d["year"], format="%Y"
+ ).astype(np.int64)
+ d["days"] = days // NS_PER_DAY
- elif infer_dtype(dates, skipna=False) == 'datetime':
+ elif infer_dtype(dates, skipna=False) == "datetime":
if delta:
delta = dates.values - stata_epoch
- f = lambda x: \
- US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
+ f = lambda x: US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
v = np.vectorize(f)
- d['delta'] = v(delta)
+ d["delta"] = v(delta)
if year:
year_month = dates.apply(lambda x: 100 * x.year + x.month)
- d['year'] = year_month.values // 100
- d['month'] = (year_month.values - d['year'] * 100)
+ d["year"] = year_month.values // 100
+ d["month"] = year_month.values - d["year"] * 100
if days:
f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
v = np.vectorize(f)
- d['days'] = v(dates)
+ d["days"] = v(dates)
else:
- raise ValueError('Columns containing dates must contain either '
- 'datetime64, datetime.datetime or null values.')
+ raise ValueError(
+ "Columns containing dates must contain either "
+ "datetime64, datetime.datetime or null values."
+ )
return DataFrame(d, index=index)
@@ -434,26 +473,26 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
conv_dates = d.delta // US_PER_DAY
elif fmt in ["%tw", "tw"]:
d = parse_dates_safe(dates, year=True, days=True)
- conv_dates = (52 * (d.year - stata_epoch.year) + d.days // 7)
+ conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
elif fmt in ["%tm", "tm"]:
d = parse_dates_safe(dates, year=True)
- conv_dates = (12 * (d.year - stata_epoch.year) + d.month - 1)
+ conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
elif fmt in ["%tq", "tq"]:
d = parse_dates_safe(dates, year=True)
conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
elif fmt in ["%th", "th"]:
d = parse_dates_safe(dates, year=True)
- conv_dates = (2 * (d.year - stata_epoch.year) +
- (d.month > 6).astype(np.int))
+ conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(np.int)
elif fmt in ["%ty", "ty"]:
d = parse_dates_safe(dates, year=True)
conv_dates = d.year
else:
raise ValueError(
- "Format {fmt} is not a known Stata date format".format(fmt=fmt))
+ "Format {fmt} is not a known Stata date format".format(fmt=fmt)
+ )
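# Worked example (editorial, not from the patch) of the monthly encoding above:
# for July 1995 the "tm" count is 12 * (1995 - 1960) + 7 - 1 = 426 months after
# the Stata epoch (January 1960).
year, month = 1995, 7
print(12 * (year - 1960) + month - 1)  # 426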
conv_dates = Series(conv_dates, dtype=np.float64)
- missing_value = struct.unpack('= 2 ** 53:
- ws = precision_loss_doc % ('uint64', 'float64')
+ ws = precision_loss_doc % ("uint64", "float64")
data[col] = data[col].astype(dtype)
@@ -561,28 +602,31 @@ def _cast_to_stata_types(data):
if data[col].max() > 32740 or data[col].min() < -32767:
data[col] = data[col].astype(np.int32)
elif dtype == np.int64:
- if (data[col].max() <= 2147483620 and
- data[col].min() >= -2147483647):
+ if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
data[col] = data[col].astype(np.int32)
else:
data[col] = data[col].astype(np.float64)
if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
- ws = precision_loss_doc % ('int64', 'float64')
+ ws = precision_loss_doc % ("int64", "float64")
elif dtype in (np.float32, np.float64):
value = data[col].max()
if np.isinf(value):
- raise ValueError('Column {col} has a maximum value of '
- 'infinity which is outside the range '
- 'supported by Stata.'.format(col=col))
+ raise ValueError(
+ "Column {col} has a maximum value of "
+ "infinity which is outside the range "
+ "supported by Stata.".format(col=col)
+ )
if dtype == np.float32 and value > float32_max:
data[col] = data[col].astype(np.float64)
elif dtype == np.float64:
if value > float64_max:
- raise ValueError('Column {col} has a maximum value '
- '({val}) outside the range supported by '
- 'Stata ({float64_max})'
- .format(col=col, val=value,
- float64_max=float64_max))
+ raise ValueError(
+ "Column {col} has a maximum value "
+ "({val}) outside the range supported by "
+ "Stata ({float64_max})".format(
+ col=col, val=value, float64_max=float64_max
+ )
+ )
if ws:
warnings.warn(ws, PossiblePrecisionLoss)
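# Editorial note (not from the patch): the 2 ** 53 threshold above is where
# float64 stops representing every integer exactly, hence the
# PossiblePrecisionLoss warning when int64/uint64 values that large are cast
# to float64.
print(2 ** 53)                                # 9007199254740992
print(float(2 ** 53) == float(2 ** 53 + 1))   # True -> neighbouring ints collide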
@@ -630,8 +674,10 @@ def __init__(self, catarray):
category = vl[1]
if not isinstance(category, str):
category = str(category)
- warnings.warn(value_label_mismatch_doc.format(catarray.name),
- ValueLabelTypeMismatch)
+ warnings.warn(
+ value_label_mismatch_doc.format(catarray.name),
+ ValueLabelTypeMismatch,
+ )
self.off.append(self.text_len)
self.text_len += len(category) + 1 # +1 for the padding
@@ -640,9 +686,11 @@ def __init__(self, catarray):
self.n += 1
if self.text_len > 32000:
- raise ValueError('Stata value labels for a single variable must '
- 'have a combined length less than 32,000 '
- 'characters.')
+ raise ValueError(
+ "Stata value labels for a single variable must "
+ "have a combined length less than 32,000 "
+ "characters."
+ )
# Ensure int32
self.off = np.array(self.off, dtype=np.int32)
@@ -674,11 +722,11 @@ def generate_value_label(self, byteorder, encoding):
self._encoding = encoding
bio = BytesIO()
- null_string = '\x00'
- null_byte = b'\x00'
+ null_string = "\x00"
+ null_byte = b"\x00"
# len
- bio.write(struct.pack(byteorder + 'i', self.len))
+ bio.write(struct.pack(byteorder + "i", self.len))
# labname
labname = self._encode(_pad_bytes(self.labname[:32], 33))
@@ -686,22 +734,22 @@ def generate_value_label(self, byteorder, encoding):
# padding - 3 bytes
for i in range(3):
- bio.write(struct.pack('c', null_byte))
+ bio.write(struct.pack("c", null_byte))
# value_label_table
# n - int32
- bio.write(struct.pack(byteorder + 'i', self.n))
+ bio.write(struct.pack(byteorder + "i", self.n))
# textlen - int32
- bio.write(struct.pack(byteorder + 'i', self.text_len))
+ bio.write(struct.pack(byteorder + "i", self.text_len))
# off - int32 array (n elements)
for offset in self.off:
- bio.write(struct.pack(byteorder + 'i', offset))
+ bio.write(struct.pack(byteorder + "i", offset))
# val - int32 array (n elements)
for value in self.val:
- bio.write(struct.pack(byteorder + 'i', value))
+ bio.write(struct.pack(byteorder + "i", value))
# txt - Text labels, null terminated
for text in self.txt:
@@ -760,36 +808,37 @@ class StataMissingValue:
bases = (101, 32741, 2147483621)
for b in bases:
# Conversion to long to avoid hash issues on 32 bit platforms #8968
- MISSING_VALUES[b] = '.'
+ MISSING_VALUES[b] = "."
for i in range(1, 27):
- MISSING_VALUES[i + b] = '.' + chr(96 + i)
+ MISSING_VALUES[i + b] = "." + chr(96 + i)
- float32_base = b'\x00\x00\x00\x7f'
- increment = struct.unpack(' 0:
MISSING_VALUES[value] += chr(96 + i)
- int_value = struct.unpack(' 0:
MISSING_VALUES[value] += chr(96 + i)
- int_value = struct.unpack('q', struct.pack(' 0
+ self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0
# calculate size of a data record
self.col_sizes = [self._calcsize(typ) for typ in self.typlist]
@@ -1038,10 +1135,9 @@ def _read_new_header(self, first_char):
raise ValueError(_version_error)
self._set_encoding()
         self.path_or_buf.read(21)  # </release><byteorder>
- self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
+ self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
         self.path_or_buf.read(15)  # </byteorder><K>
- self.nvar = struct.unpack(self.byteorder + 'H',
- self.path_or_buf.read(2))[0]
+ self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0]
         self.path_or_buf.read(7)  # </K><N>
self.nobs = self._get_nobs()
@@ -1053,27 +1149,35 @@ def _read_new_header(self, first_char):
self.path_or_buf.read(8) # 0x0000000000000000
         self.path_or_buf.read(8)  # position of <stata_data>