
Commit

Merge branch 'main' into fix-doc-issue-60366
mingjji authored Nov 23, 2024
2 parents d409d9d + ee0902a commit 62c10af
Showing 21 changed files with 276 additions and 42 deletions.
7 changes: 0 additions & 7 deletions ci/code_checks.sh
@@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Period.freq GL08" \
-i "pandas.Period.ordinal GL08" \
-i "pandas.RangeIndex.from_range PR01,SA01" \
-i "pandas.Series.dt.freq GL08" \
-i "pandas.Series.dt.unit GL08" \
-i "pandas.Series.pad PR01,SA01" \
-i "pandas.Timedelta.max PR02" \
@@ -92,15 +91,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \
-i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \
-i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
-i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \
-i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \
-i "pandas.core.groupby.SeriesGroupBy.nth PR02" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
-i "pandas.core.resample.Resampler.get_group RT03,SA01" \
Expand All @@ -114,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.resample.Resampler.std SA01" \
-i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
-i "pandas.core.resample.Resampler.var SA01" \
-i "pandas.errors.AttributeConflictWarning SA01" \
-i "pandas.errors.ChainedAssignmentError SA01" \
-i "pandas.errors.DuplicateLabelError SA01" \
-i "pandas.errors.IntCastingNaNError SA01" \
-i "pandas.errors.InvalidIndexError SA01" \
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -54,6 +54,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs``, which is forwarded to :meth:`pyarrow.Table.to_pandas`; this enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as Python dictionaries (:issue:`56842`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, and :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Series.map` can now accept ``kwargs`` to pass on to ``func``, as shown in the sketch after this list (:issue:`59814`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
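A minimal sketch of the new :meth:`Series.map` pass-through (the mapper function and keyword below are illustrative, assuming extra keywords are forwarded to ``func`` as described):

>>> import pandas as pd
>>> def add(x, amount=0):  # hypothetical mapper taking an extra keyword
...     return x + amount
>>> pd.Series([1, 2, 3]).map(add, amount=10)
0    11
1    12
2    13
dtype: int64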
@@ -626,6 +627,7 @@ Datetimelike
- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow-backed :class:`Series` (:issue:`59154`)
- Bug in :meth:`to_datetime` not respecting ``dayfirst`` if an uncommon date string was passed (:issue:`58859`)
- Bug in :meth:`to_datetime` reporting an incorrect index in case of any failure scenario (:issue:`58298`)
- Bug in :meth:`to_datetime` wrongly converting ``arg`` when it is a ``np.datetime64`` object with unit ``ps`` (:issue:`60341`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

Timedelta
@@ -688,6 +690,7 @@ I/O
- Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
- Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
- Bug in :meth:`DataFrame.to_excel` where a :class:`MultiIndex` level containing :class:`Period` values was written as text rather than as a date (:issue:`60099`)
- Bug in :meth:`DataFrame.to_stata` when writing a :class:`DataFrame` with ``byteorder="big"`` (:issue:`58969`)
- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
@@ -762,7 +765,7 @@ ExtensionArray

Styler
^^^^^^
-
- Bug in :meth:`Styler.to_latex` where styling column headers did not apply correctly when combined with a hidden index or hidden index levels (:issue:`52218`)

Other
^^^^^
11 changes: 6 additions & 5 deletions pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -660,11 +660,12 @@ void pandas_datetime_to_datetimestruct(npy_datetime dt, NPY_DATETIMEUNIT base,
perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;

set_datetimestruct_days(extract_unit(&dt, perday), out);
out->hour = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000);
out->us = (npy_int32)extract_unit(&dt, 1000LL);
out->ps = (npy_int32)(dt * 1000);
out->hour =
(npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60 * 60);
out->min = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 * 60);
out->sec = (npy_int32)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000);
out->us = (npy_int32)extract_unit(&dt, 1000LL * 1000);
out->ps = (npy_int32)(dt);
break;

case NPY_FR_fs:
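A short sketch of the behavior this hunk corrects (GH 60341): the picosecond branch previously extracted the hour, minute, and second fields with nanosecond-scale divisors. Assuming a build containing the fix, a ``ps``-unit value now converts cleanly (the repr shown is illustrative):

>>> import numpy as np
>>> import pandas as pd
>>> pd.to_datetime(np.datetime64("1970-01-01T00:00:01.000000001", "ps"))
Timestamp('1970-01-01 00:00:01.000000001')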
3 changes: 2 additions & 1 deletion pandas/core/frame.py
@@ -4742,7 +4742,8 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
3 4 4 7 8 0
4 5 2 6 7 3
For columns with spaces in their name, you can use backtick quoting.
For columns with spaces or other disallowed characters in their name, you can
use backtick quoting.
>>> df.eval("B * `C&C`")
0 100
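For context, a minimal setup matching the doctest above might look like this (the data values are assumptions for illustration):

>>> import pandas as pd
>>> df = pd.DataFrame({"B": [10, 20], "C&C": [10, 5]})
>>> df.eval("B * `C&C`")  # backticks quote the otherwise disallowed '&'
0    100
1    100
dtype: int64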
10 changes: 10 additions & 0 deletions pandas/core/groupby/generic.py
@@ -1443,6 +1443,11 @@ def is_monotonic_increasing(self) -> Series:
-------
Series
See Also
--------
SeriesGroupBy.is_monotonic_decreasing : Return whether each group's values
are monotonically decreasing.
Examples
--------
>>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"])
@@ -1462,6 +1467,11 @@ def is_monotonic_decreasing(self) -> Series:
-------
Series
See Also
--------
SeriesGroupBy.is_monotonic_increasing : Return whether each group's values
are monotonically increasing.
Examples
--------
>>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"])
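The truncated doctests continue along these lines (same data as above):

>>> import pandas as pd
>>> s = pd.Series([2, 1, 3, 4], index=["Falcon", "Falcon", "Parrot", "Parrot"])
>>> s.groupby(level=0).is_monotonic_increasing
Falcon    False
Parrot     True
dtype: bool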
13 changes: 0 additions & 13 deletions pandas/core/groupby/groupby.py
@@ -3983,19 +3983,6 @@ def nth(self) -> GroupByNthSelector:
'all' or 'any'; this is equivalent to calling dropna(how=dropna)
before the groupby.
Parameters
----------
n : int, slice or list of ints and slices
A single nth value for the row or a list of nth values or slices.
.. versionchanged:: 1.4.0
Added slice and lists containing slices.
Added index notation.
dropna : {'any', 'all', None}, default None
Apply the specified dropna operation before counting which row is
the nth row. Only supported if n is an int.
Returns
-------
Series or DataFrame
22 changes: 22 additions & 0 deletions pandas/core/indexes/accessors.py
@@ -373,6 +373,28 @@ def to_pydatetime(self) -> Series:

@property
def freq(self):
"""
Tries to return a string representing a frequency generated by infer_freq.
Returns None if it can't autodetect the frequency.
See Also
--------
Series.dt.to_period : Cast to PeriodArray/PeriodIndex at a particular
frequency.
Examples
--------
>>> ser = pd.Series(["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"])
>>> ser = pd.to_datetime(ser)
>>> ser.dt.freq
'D'
>>> ser = pd.Series(["2022-01-01", "2024-01-01", "2026-01-01", "2028-01-01"])
>>> ser = pd.to_datetime(ser)
>>> ser.dt.freq
'2YS-JAN'
"""
return self._get_values().inferred_freq

def isocalendar(self) -> DataFrame:
8 changes: 4 additions & 4 deletions pandas/core/series.py
@@ -567,7 +567,7 @@ def __arrow_c_stream__(self, requested_schema=None):
Export the pandas Series as an Arrow C stream PyCapsule.
This relies on pyarrow to convert the pandas Series to the Arrow
format (and follows the default behaviour of ``pyarrow.Array.from_pandas``
format (and follows the default behavior of ``pyarrow.Array.from_pandas``
in its handling of the index, i.e. to ignore it).
This conversion is not necessarily zero-copy.
@@ -2226,7 +2226,7 @@ def drop_duplicates(
5 hippo
Name: animal, dtype: object
With the 'keep' parameter, the selection behaviour of duplicated values
With the 'keep' parameter, the selection behavior of duplicated values
can be changed. The value 'first' keeps the first occurrence for each
set of duplicated entries. The default value of keep is 'first'.
@@ -3451,7 +3451,7 @@ def sort_values(
4 5.0
dtype: float64
Sort values ascending order (default behaviour)
Sort values ascending order (default behavior)
>>> s.sort_values(ascending=True)
1 1.0
@@ -4098,7 +4098,7 @@ def swaplevel(
In the following example, we will swap the levels of the indices.
Here, we will swap the levels column-wise, but levels can be swapped row-wise
in a similar manner. Note that column-wise is the default behaviour.
in a similar manner. Note that column-wise is the default behavior.
By not supplying any arguments for i and j, we swap the last and second to
last indices.
11 changes: 11 additions & 0 deletions pandas/errors/__init__.py
@@ -487,6 +487,11 @@ class ChainedAssignmentError(Warning):
For more information on Copy-on-Write,
see :ref:`the user guide<copy_on_write>`.
See Also
--------
options.mode.copy_on_write : Global setting for enabling or disabling
Copy-on-Write behavior.
Examples
--------
>>> pd.options.mode.copy_on_write = True
@@ -672,6 +677,12 @@ class AttributeConflictWarning(Warning):
name than the existing index on an HDFStore or attempting to append an index with a
different frequency than the existing index on an HDFStore.
See Also
--------
HDFStore : Dict-like IO interface for storing pandas objects in PyTables.
DataFrame.to_hdf : Write the contained data to an HDF5 file using HDFStore.
read_hdf : Read from an HDF5 file into a DataFrame.
Examples
--------
>>> idx1 = pd.Index(["a", "b"], name="name1")
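A minimal sketch extending the truncated ``ChainedAssignmentError`` doctest (assumes Copy-on-Write is enabled, as in the example's first line):

>>> import pandas as pd
>>> pd.options.mode.copy_on_write = True
>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> df["a"][0] = 10  # doctest: +SKIP
>>> # the chained assignment above targets an intermediate copy under
>>> # Copy-on-Write, so pandas emits a ChainedAssignmentError warning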
5 changes: 4 additions & 1 deletion pandas/io/_util.py
@@ -60,9 +60,12 @@ def arrow_table_to_pandas(
table: pyarrow.Table,
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
null_to_int64: bool = False,
to_pandas_kwargs: dict | None = None,
) -> pd.DataFrame:
pa = import_optional_dependency("pyarrow")

to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs

types_mapper: type[pd.ArrowDtype] | None | Callable
if dtype_backend == "numpy_nullable":
mapping = _arrow_dtype_mapping()
@@ -80,5 +83,5 @@
else:
raise NotImplementedError

df = table.to_pandas(types_mapper=types_mapper)
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
return df
8 changes: 8 additions & 0 deletions pandas/io/formats/excel.py
@@ -37,6 +37,7 @@
DataFrame,
Index,
MultiIndex,
Period,
PeriodIndex,
)
import pandas.core.common as com
@@ -803,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]:
allow_fill=levels._can_hold_na,
fill_value=levels._na_value,
)
# GH#60099
if isinstance(values[0], Period):
values = values.to_timestamp()

for i, span_val in spans.items():
mergestart, mergeend = None, None
@@ -827,6 +831,10 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]:
# Format hierarchical rows with non-merged values.
for indexcolvals in zip(*self.df.index):
for idx, indexcolval in enumerate(indexcolvals):
# GH#60099
if isinstance(indexcolval, Period):
indexcolval = indexcolval.to_timestamp()

yield CssExcelCell(
row=self.rowcounter + idx,
col=gcolidx,
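A sketch of the case GH#60099 addresses (the file name and data are illustrative; writing requires an Excel engine such as openpyxl):

>>> import pandas as pd
>>> idx = pd.MultiIndex.from_arrays(
...     [pd.period_range("2024-01", periods=2, freq="M"), ["a", "b"]]
... )
>>> df = pd.DataFrame({"x": [1, 2]}, index=idx)
>>> # the Period level is now converted via to_timestamp() before being
>>> # written, so Excel stores dates instead of stringified Period objects
>>> df.to_excel("periods.xlsx")  # doctest: +SKIP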
3 changes: 2 additions & 1 deletion pandas/io/formats/style_render.py
@@ -868,7 +868,8 @@ def _translate_latex(self, d: dict, clines: str | None) -> None:
or multirow sparsification (so that \multirow and \multicol work correctly).
"""
index_levels = self.index.nlevels
visible_index_level_n = index_levels - sum(self.hide_index_)
# GH 52218
visible_index_level_n = max(1, index_levels - sum(self.hide_index_))
d["head"] = [
[
{**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]}
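A sketch of the scenario the ``max(1, ...)`` guard handles (GH 52218): with all index levels hidden, the visible-level count previously reached zero and misaligned the column-style lookup for styled headers. The styling call below is illustrative:

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1], "B": [2]})
>>> styler = df.style.hide(axis="index").map_index(
...     lambda v: "font-weight: bold;", axis="columns"
... )
>>> print(styler.to_latex(convert_css=True))  # doctest: +SKIP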
22 changes: 20 additions & 2 deletions pandas/io/parquet.py
@@ -242,6 +242,7 @@ def read(
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
storage_options: StorageOptions | None = None,
filesystem=None,
to_pandas_kwargs: dict[str, Any] | None = None,
**kwargs,
) -> DataFrame:
kwargs["use_pandas_metadata"] = True
@@ -266,7 +267,11 @@
"make_block is deprecated",
DeprecationWarning,
)
result = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
result = arrow_table_to_pandas(
pa_table,
dtype_backend=dtype_backend,
to_pandas_kwargs=to_pandas_kwargs,
)

if pa_table.schema.metadata:
if b"PANDAS_ATTRS" in pa_table.schema.metadata:
@@ -347,6 +352,7 @@ def read(
filters=None,
storage_options: StorageOptions | None = None,
filesystem=None,
to_pandas_kwargs: dict | None = None,
**kwargs,
) -> DataFrame:
parquet_kwargs: dict[str, Any] = {}
@@ -362,6 +368,10 @@
raise NotImplementedError(
"filesystem is not implemented for the fastparquet engine."
)
if to_pandas_kwargs is not None:
raise NotImplementedError(
"to_pandas_kwargs is not implemented for the fastparquet engine."
)
path = stringify_path(path)
handles = None
if is_fsspec_url(path):
@@ -452,7 +462,7 @@ def to_parquet(
.. versionadded:: 2.1.0
kwargs
Additional keyword arguments passed to the engine
Additional keyword arguments passed to the engine.
Returns
-------
@@ -491,6 +501,7 @@ def read_parquet(
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
filesystem: Any = None,
filters: list[tuple] | list[list[tuple]] | None = None,
to_pandas_kwargs: dict | None = None,
**kwargs,
) -> DataFrame:
"""
@@ -564,6 +575,12 @@
.. versionadded:: 2.1.0
to_pandas_kwargs : dict | None, default None
Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas`
when ``engine="pyarrow"``.
.. versionadded:: 3.0.0
**kwargs
Any additional kwargs are passed to the engine.
@@ -636,5 +653,6 @@
storage_options=storage_options,
dtype_backend=dtype_backend,
filesystem=filesystem,
to_pandas_kwargs=to_pandas_kwargs,
**kwargs,
)
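A usage sketch of the new pass-through (the path and the map column it implies are illustrative; requires the pyarrow engine):

>>> import pandas as pd
>>> df = pd.read_parquet(  # doctest: +SKIP
...     "data.parquet",
...     engine="pyarrow",
...     # forwarded verbatim to pyarrow.Table.to_pandas; "strict" converts
...     # Parquet map columns to Python dicts and raises on duplicate keys
...     to_pandas_kwargs={"maps_as_pydicts": "strict"},
... )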