Merge pull request #90 from ImperialCollegeLondon/add-temperature-data
Add methods for importing external data
tomjholland authored Aug 14, 2024
2 parents 8846410 + 0bf97dc commit d3699ea
Showing 20 changed files with 424 additions and 125 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/sphinx.yml
@@ -21,8 +21,8 @@ jobs:
       - name: Gather dependencies
         run: |
           pip install -U pip
-          pip install .
           pip install -r requirements-dev.txt
+          pip install .
           pip install ipykernel
           sudo apt-get install pandoc
2 changes: 1 addition & 1 deletion docs/source/developer_guide/developer_installation.rst
@@ -4,7 +4,7 @@ Installation
 To install PyProBE you must be running Python 3.11 or later. It is recommended to use a
 virtual environment to install PyProBE, for example venv or conda.
 
-The steps to install PyProBE wih developer settings are as follows:
+The steps to install PyProBE with developer settings are as follows:
 
 1. Enter a directory in which you wish to install PyProBE:
11 changes: 11 additions & 0 deletions docs/source/user_guide/importing_data.rst
@@ -174,6 +174,17 @@ parametrically. The steps are as follows:
             filename = output_name_generator,
             filename_inputs = ["Cycler", "Channel"])
 
+Adding data not from a cycler
+-----------------------------
+In your battery experiment, it is likely that you will be collecting data from
+sources in addition to your battery cycler. This data can be added to your
+:class:`~pyprobe.filters.Procedure` object after it has been created, with its
+:func:`~pyprobe.filters.Procedure.add_external_data` method.
+
+The data that you provide must be a time series, with a column that can be
+interpreted in DateTime format. This is usually a string of the form
+``"2024-02-29 09:19:58.554"``. PyProBE will interpolate your data onto the
+time axis of the existing cycling data, so it can be filtered as normal.
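
For example, a short usage sketch of the method described above — the file name and source column name here are hypothetical, and `procedure` is assumed to be an existing `pyprobe.filters.Procedure`:

```python
# Hedged sketch: "temperature_log.csv" is a hypothetical CSV with a "Date"
# column and a "Temp (C)" thermocouple column.
procedure.add_external_data(
    filepath="temperature_log.csv",
    importing_columns={"Temp (C)": "Temperature [C]"},  # rename on import
    date_column_name="Date",
)
```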


.. footbibliography::
8 changes: 4 additions & 4 deletions pyprobe/analysis/cycling.py
@@ -47,22 +47,22 @@ def summary(self, dchg_before_chg: bool = True) -> Result:
         )
 
         self._create_capacity_throughput()
-        lf_capacity_throughput = self.input_data.base_dataframe.groupby(
+        lf_capacity_throughput = self.input_data.base_dataframe.group_by(
             "Cycle", maintain_order=True
         ).agg(pl.col("Capacity Throughput [Ah]").first())
-        lf_time = self.input_data.base_dataframe.groupby(
+        lf_time = self.input_data.base_dataframe.group_by(
             "Cycle", maintain_order=True
         ).agg(pl.col("Time [s]").first())
 
         lf_charge = (
             self.input_data.charge()
-            .base_dataframe.groupby("Cycle", maintain_order=True)
+            .base_dataframe.group_by("Cycle", maintain_order=True)
             .agg(pl.col("Capacity [Ah]").max() - pl.col("Capacity [Ah]").min())
             .rename({"Capacity [Ah]": "Charge Capacity [Ah]"})
         )
         lf_discharge = (
             self.input_data.discharge()
-            .base_dataframe.groupby("Cycle", maintain_order=True)
+            .base_dataframe.group_by("Cycle", maintain_order=True)
             .agg(pl.col("Capacity [Ah]").max() - pl.col("Capacity [Ah]").min())
             .rename({"Capacity [Ah]": "Discharge Capacity [Ah]"})
         )
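
The change above tracks polars' rename of `groupby` to `group_by`; a minimal standalone sketch of the new spelling, with made-up data:

```python
import polars as pl

df = pl.DataFrame({"Cycle": [0, 0, 1, 1], "Time [s]": [0.0, 1.0, 2.0, 3.0]})

# group_by (formerly groupby) with maintain_order=True keeps cycles in the
# order they first appear.
first_times = df.group_by("Cycle", maintain_order=True).agg(
    pl.col("Time [s]").first()
)
print(first_times)
```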
39 changes: 35 additions & 4 deletions pyprobe/cyclers/basecycler.py
@@ -37,7 +37,6 @@ def model_post_init(self, __context: Any) -> None:
         """Post initialization method for the BaseModel."""
         dataframe_list = self.get_dataframe_list(self.input_data_path)
         self._imported_dataframe = self.get_imported_dataframe(dataframe_list)
-        self._dataframe_columns = self._imported_dataframe.columns
 
     @staticmethod
     def read_file(filepath: str) -> pl.DataFrame | pl.LazyFrame:
@@ -73,10 +72,10 @@ def get_dataframe_list(
         files = glob.glob(input_data_path)
         files = self._sort_files(files)
         list = [self.read_file(file) for file in files]
-        all_columns = set([col for df in list for col in df.columns])
+        all_columns = set([col for df in list for col in df.collect_schema().names()])
         indices_to_remove = []
         for i in range(len(list)):
-            if len(list[i].columns) < len(all_columns):
+            if len(list[i].collect_schema().names()) < len(all_columns):
                 indices_to_remove.append(i)
                 warnings.warn(
                     f"File {os.path.basename(files[i])} has missing columns, "
@@ -151,8 +150,18 @@ def required_columns(self) -> Dict[str, pl.Expr]:
             "Current [A]": self.current,
             "Voltage [V]": self.voltage,
             "Capacity [Ah]": self.capacity,
+            "Temperature [C]": self.temperature,
         }
 
+    @property
+    def _dataframe_columns(self) -> List[str]:
+        """The columns of the DataFrame.
+
+        Returns:
+            List[str]: The columns.
+        """
+        return self._imported_dataframe.collect_schema().names()
+
     @property
     def pyprobe_dataframe(self) -> pl.DataFrame:
         """The DataFrame containing the required columns.
@@ -171,7 +180,7 @@ def date(self) -> pl.Expr:
         """
         if (
             self._imported_dataframe.dtypes[
-                self._imported_dataframe.columns.index("Date")
+                self._imported_dataframe.collect_schema().names().index("Date")
             ]
             != pl.Datetime
         ):
@@ -302,6 +311,28 @@ def capacity(self) -> pl.Expr:
         else:
             return self.capacity_from_ch_dch
 
+    @property
+    def temperature(self) -> pl.Expr:
+        """Identify and format the temperature column.
+
+        An optional column; if it is not found, a column of null values is returned.
+
+        Returns:
+            pl.Expr: A polars expression for the temperature column.
+        """
+        try:
+            return (
+                self.search_columns(
+                    self._dataframe_columns,
+                    self.column_dict["Temperature"],
+                    self.column_name_pattern,
+                )
+                .to_default_name_and_unit()
+                .cast(pl.Float64)
+            )
+        except ValueError:
+            return pl.lit(None).alias("Temperature [C]")
+
     @property
     def cycle(self) -> pl.Expr:
         """Identify the cycle number.
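
The new `temperature` property is an optional-column pattern: attempt the usual column search, and fall back to a typed null column when nothing matches. A minimal sketch of the fallback, independent of PyProBE's column-search helpers:

```python
import polars as pl

df = pl.DataFrame({"Voltage [V]": [3.7, 3.8]})  # no temperature column present

# Fall back to a null Float64 column so downstream code can always rely on
# "Temperature [C]" existing.
if "Temperature [C]" in df.columns:
    temperature = pl.col("Temperature [C]").cast(pl.Float64)
else:
    temperature = pl.lit(None, dtype=pl.Float64).alias("Temperature [C]")

print(df.with_columns(temperature))
```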
13 changes: 2 additions & 11 deletions pyprobe/cyclers/biologic.py
@@ -27,6 +27,7 @@ class Biologic(BaseCycler):
         "Voltage": "Ecell",
         "Charge Capacity": "Q charge",
         "Discharge Capacity": "Q discharge",
+        "Temperature": "Temperature",
     }
 
     @staticmethod
@@ -54,24 +55,14 @@ def read_file(filepath: str) -> pl.DataFrame | pl.LazyFrame:
         _, value = start_time_line.split(" : ")
         start_time = datetime.strptime(value.strip(), "%m/%d/%Y %H:%M:%S.%f")
 
-        columns_to_read = ["time/", "Ns", "I/", "Ecell/", "Q charge/", "Q discharge/"]
-
-        all_columns = pl.scan_csv(
-            filepath, skip_rows=n_header_lines - 1, separator="\t"
-        ).columns
-        selected_columns = []
-        for substring in columns_to_read:
-            found_columns = [col for col in all_columns if substring in col]
-            selected_columns.extend(found_columns)
-
         dataframe = pl.scan_csv(
             filepath,
             skip_rows=n_header_lines - 1,
             separator="\t",
         )
 
         dataframe = dataframe.with_columns(
-            (pl.col("time/s") * 1000000 + pl.lit(start_time))
+            (pl.col("time/s").cast(pl.Duration) + pl.lit(start_time))
             .cast(pl.Datetime)
             .alias("Date")
         )
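
The `Date` change above offsets the file's start timestamp by the elapsed-seconds column. A standalone sketch of the idea, with an explicit seconds-to-microseconds conversion (the microsecond time unit is an assumption of this sketch):

```python
import polars as pl
from datetime import datetime

start_time = datetime(2024, 2, 29, 9, 19, 58)
df = pl.DataFrame({"time/s": [0.0, 0.5, 1.0]})

# Elapsed seconds -> integer microseconds -> Duration, then offset the start
# timestamp to get an absolute Datetime column.
df = df.with_columns(
    (
        (pl.col("time/s") * 1_000_000)
        .cast(pl.Int64)
        .cast(pl.Duration(time_unit="us"))
        + pl.lit(start_time)
    ).alias("Date")
)
print(df)
```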
1 change: 1 addition & 0 deletions pyprobe/cyclers/neware.py
@@ -23,6 +23,7 @@ class Neware(BaseCycler):
         "Voltage": "Voltage",
         "Charge Capacity": "Chg. Cap.",
         "Discharge Capacity": "DChg. Cap.",
+        "Temperature": "Temperature",
     }
 
     @property
2 changes: 1 addition & 1 deletion pyprobe/dashboard.py
@@ -38,7 +38,7 @@ def dataframe_with_selections(df: pd.DataFrame) -> List[int]:
         df_with_selections,
         hide_index=True,  # Hide the index column
         column_config={"Select": st.column_config.CheckboxColumn(required=True)},
-        disabled=df.columns,
+        disabled=df.collect_schema().names(),
     )
 
     # Filter the dataframe using the temporary column, then drop the column
56 changes: 56 additions & 0 deletions pyprobe/filters.py
@@ -1,4 +1,6 @@
 """A module for the filtering classes."""
+import os
+import warnings
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import polars as pl
@@ -304,6 +306,60 @@ def experiment_names(self) -> List[str]:
         """
         return list(self.titles)
 
+    def add_external_data(
+        self,
+        filepath: str,
+        importing_columns: List[str] | Dict[str, str],
+        date_column_name: str = "Date",
+    ) -> None:
+        """Add data from another source to the procedure.
+
+        The data must be timestamped, with a column that can be interpreted in
+        DateTime format. The data will be interpolated to the procedure's time.
+
+        Args:
+            filepath (str): The path to the external file.
+            importing_columns (List[str] | Dict[str, str]):
+                The columns to import from the external file. If a list, the
+                columns are imported as-is. If a dict, the keys are the column
+                names in the external data and the values are the names to
+                rename them to.
+            date_column_name (str, optional):
+                The name of the date column in the external data. Defaults to
+                "Date".
+        """
+        external_data = self.load_external_file(filepath)
+        if isinstance(importing_columns, dict):
+            external_data = external_data.select(
+                [date_column_name] + list(importing_columns.keys())
+            )
+            external_data = external_data.rename(importing_columns)
+        elif isinstance(importing_columns, list):
+            external_data = external_data.select([date_column_name] + importing_columns)
+        self.add_new_data_columns(external_data, date_column_name)
+
+    def load_external_file(self, filepath: str) -> pl.LazyFrame:
+        """Load an external file into a LazyFrame.
+
+        Supported file types are CSV, Parquet, and Excel. For maximum
+        performance, consider using Parquet files. If you have an Excel file,
+        consider converting it to CSV before loading.
+
+        Args:
+            filepath (str): The path to the external file.
+        """
+        file = os.path.basename(filepath)
+        file_ext = os.path.splitext(file)[1]
+        match file_ext:
+            case ".csv":
+                return pl.scan_csv(filepath)
+            case ".parquet":
+                return pl.scan_parquet(filepath)
+            case ".xlsx":
+                warnings.warn("Excel reading is slow. Consider converting to CSV.")
+                return pl.read_excel(filepath)
+            case _:
+                raise ValueError(f"Unsupported file type: {file_ext}")
+
     @classmethod
     def _flatten(cls, lst: int | List[Any]) -> List[int]:
         """Flatten a list of lists into a single list.
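
Both accepted forms of `importing_columns`, as a hedged usage sketch (`procedure` and the file and column names are hypothetical):

```python
# List form: import the columns under their existing names.
procedure.add_external_data(
    "thermocouple.csv", importing_columns=["Temperature [C]"]
)

# Dict form: select the source columns and rename them on import.
procedure.add_external_data(
    "thermocouple.csv",
    importing_columns={"Temp/degC": "Temperature [C]"},
)
```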
4 changes: 3 additions & 1 deletion pyprobe/rawdata.py
@@ -44,7 +44,9 @@ def _check_required_columns(
     ) -> "RawData":
         """Check if the required columns are present in the input_data."""
         missing_columns = [
-            col for col in required_columns if col not in dataframe.columns
+            col
+            for col in required_columns
+            if col not in dataframe.collect_schema().names()
         ]
         if missing_columns:
             raise ValueError(f"Missing required columns: {missing_columns}")
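
The recurring `.columns` → `collect_schema().names()` substitution throughout this commit keeps column lookups working on both eager and lazy frames; a minimal sketch:

```python
import polars as pl

lf = pl.LazyFrame({"Time [s]": [0.0, 1.0], "Voltage [V]": [3.7, 3.8]})

# collect_schema() resolves the schema without collecting any data; .names()
# then lists the column names.
print(lf.collect_schema().names())  # ['Time [s]', 'Voltage [V]']
```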
62 changes: 57 additions & 5 deletions pyprobe/result.py
@@ -55,7 +55,7 @@ def __call__(self, column_name: str) -> NDArray[np.float64]:
         )
 
         self._check_units(column_name)
-        if column_name not in self.data.columns:
+        if column_name not in self.data.collect_schema().names():
             raise ValueError(f"Column '{column_name}' not in data.")
         else:
             return self.data[column_name].to_numpy()
@@ -72,7 +72,7 @@ def __getitem__(self, *column_name: str) -> "Result":
         column_names = list(column_name)
         for col in column_names:
             self._check_units(col)
-        if not all(col in self.data.columns for col in column_names):
+        if not all(col in self.data.collect_schema().names() for col in column_names):
             raise ValueError("One or more columns not in data.")
         else:
             return Result(base_dataframe=self.data.select(column_names), info=self.info)
@@ -145,7 +145,7 @@ def _get_filtered_array(
     ) -> NDArray[np.float64]:
         for column_name in filtering_column_names:
             self._check_units(column_name)
-            if column_name not in self.base_dataframe.columns:
+            if column_name not in self.base_dataframe.collect_schema().names():
                 raise ValueError(f"Column '{column_name}' not in data.")
         frame_to_return = self.base_dataframe.select(filtering_column_names)
         if isinstance(frame_to_return, pl.LazyFrame):
@@ -160,7 +160,7 @@ def _check_units(self, column_name: str) -> None:
         Args:
             column_name (str): The column name to convert to.
         """
-        if column_name not in self.base_dataframe.columns:
+        if column_name not in self.base_dataframe.collect_schema().names():
             converter_object = Units(column_name)
             if converter_object.input_quantity in self.quantities:
                 instruction = converter_object.from_default_unit()
@@ -193,7 +193,7 @@ def quantities(self) -> List[str]:
     @property
     def column_list(self) -> List[str]:
         """Return a list of the columns in the data."""
-        return self.base_dataframe.columns
+        return self.base_dataframe.collect_schema().names()
 
     def define_column(self, column_name: str, definition: str) -> None:
         """Define a new column when it is added to the dataframe.
@@ -230,6 +230,58 @@ def clean_copy(
             column_definitions=column_definitions,
         )
 
+    def add_new_data_columns(
+        self, new_data: pl.DataFrame | pl.LazyFrame, date_column_name: str
+    ) -> None:
+        """Add new data columns to the result object.
+
+        The data must be time series data with a date column.
+
+        Args:
+            new_data (pl.DataFrame | pl.LazyFrame):
+                The new data to add to the result object.
+            date_column_name (str):
+                The name of the column in the new data containing the date.
+        """
+        # Get the columns of the new data, excluding the date column.
+        new_data_cols = new_data.collect_schema().names()
+        new_data_cols.remove(date_column_name)
+        # Match the laziness of the new data to the base dataframe.
+        is_new_data_lazy = isinstance(new_data, pl.LazyFrame)
+        is_base_dataframe_lazy = isinstance(self.base_dataframe, pl.LazyFrame)
+        if is_new_data_lazy and not is_base_dataframe_lazy:
+            new_data = new_data.collect()
+        elif is_base_dataframe_lazy and not is_new_data_lazy:
+            new_data = new_data.lazy()
+        if (
+            new_data.dtypes[new_data.collect_schema().names().index(date_column_name)]
+            != pl.Datetime
+        ):
+            new_data = new_data.with_columns(
+                pl.col(date_column_name).str.to_datetime()
+            )
+
+        # Ensure both DataFrames have Datetime columns in the same time unit.
+        new_data = new_data.with_columns(
+            pl.col(date_column_name).dt.cast_time_unit("us")
+        )
+        self.base_dataframe = self.base_dataframe.with_columns(
+            pl.col("Date").dt.cast_time_unit("us")
+        )
+
+        new_data = self.base_dataframe.join(
+            new_data,
+            left_on="Date",
+            right_on=date_column_name,
+            how="full",
+            coalesce=True,
+        )
+        new_data = new_data.with_columns(
+            pl.col(new_data_cols).interpolate_by("Date")
+        ).select(pl.col(["Date"] + new_data_cols))
+
+        self.base_dataframe = self.base_dataframe.join(
+            new_data, on="Date", how="left", coalesce=True
+        )
+
     @classmethod
     def build(
         cls,
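
A toy end-to-end sketch of the join-and-interpolate pattern used by `add_new_data_columns`, with made-up readings:

```python
import polars as pl
from datetime import datetime

base = pl.DataFrame({
    "Date": [datetime(2024, 1, 1, 0, 0, s) for s in range(5)],
    "Voltage [V]": [3.70, 3.71, 3.72, 3.73, 3.74],
})
external = pl.DataFrame({
    "Date": [datetime(2024, 1, 1, 0, 0, 0), datetime(2024, 1, 1, 0, 0, 4)],
    "Temperature [C]": [25.0, 27.0],
})

# A full join aligns both time axes, interpolate_by fills the external column
# at the cycler's timestamps, and a final left join keeps only cycler rows.
aligned = (
    base.join(external, on="Date", how="full", coalesce=True)
    .with_columns(pl.col("Temperature [C]").interpolate_by("Date"))
    .select(["Date", "Temperature [C]"])
)
merged = base.join(aligned, on="Date", how="left", coalesce=True)
print(merged)
```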
2 changes: 2 additions & 0 deletions pyprobe/units.py
@@ -17,6 +17,7 @@ class Units:
         "Ah": "Capacity",
         "A.h": "Capacity",
         "s": "Time",
+        "C": "Temperature",
     }
 
     def __init__(
@@ -47,6 +48,7 @@ def _get_default_unit(self, unit: str) -> Tuple[Optional[str], str]:
         Returns:
             Tuple[Optional[str], str]: The prefix and default unit.
         """
+        unit = re.sub(r"[^a-zA-Z]", "", unit)  # Remove non-alphabetic characters
         if unit in self.time_unit_dict.keys():
             return None, "s"
         if unit[0] in self.prefix_dict:
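
The added `re.sub` line normalises unit strings before lookup; a quick standalone check of what it strips (the sample inputs are assumptions):

```python
import re

# Keep only letters, so decorated unit strings match the plain lookup keys.
for raw in ["°C", "(C)", "A.h", "s"]:
    print(repr(re.sub(r"[^a-zA-Z]", "", raw)))
# -> 'C', 'C', 'Ah', 's'
```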