Merge pull request #47 from JuBiotech/drop-nonmonotonic-time
Drop filtersets with non-monotonic time
bertramgeinitz authored Aug 7, 2023
2 parents 2006f0a + 167a139 commit bad4b7e
Showing 5 changed files with 2,803 additions and 10 deletions.
2 changes: 1 addition & 1 deletion bletl/__init__.py
@@ -20,4 +20,4 @@
NoMeasurementData,
)

__version__ = "1.4.0"
__version__ = "1.4.1"
15 changes: 12 additions & 3 deletions bletl/parsing/bl1.py
@@ -196,8 +196,8 @@ def parse(
metadata = extract_metadata(headerlines)
process_parameters = extract_process_parameters(headerlines)
filtersets = extract_filtersets(headerlines)
comments = extract_comments(data)
references = extract_references(data)
comments = extract_comments(data, references)
measurements = extract_measurements(data)

data = BLData(
@@ -377,16 +377,25 @@ def extract_process_parameters(headerlines):
return process_parameters


def extract_comments(dfraw):
def extract_comments(dfraw: pandas.DataFrame, references: pandas.DataFrame) -> pandas.DataFrame:
"""This adds cycle numbers using timestamps from references."""
ocol_ncol_type = [
("TIME [h]", "time", float),
("COMMENTS", "user_comment", str),
]
df = utils.__to_typed_cols__(dfraw[dfraw["READING"] == "K"], ocol_ncol_type)

# Get the times when each cycle started
start_times = references.reset_index().drop_duplicates("cycle", keep="first").set_index("cycle").time
start_times.loc[1] = 0
# Add cycle numbers based on cycle start times and comment timestamps
df["cycle"] = [start_times[t > start_times].index[-1] for t in df["time"]]

# TODO: automatically separate comments into user/sys
df["sys_comment"] = None
df.index = range(len(df))
return df
# Change column order
return df[["cycle", "time", "user_comment", "sys_comment"]]


def extract_references(dfraw):
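The rewritten `extract_comments` assigns each comment to a cycle by comparing its timestamp against the per-cycle start times taken from the references table (with cycle 1 pinned to t=0). A minimal standalone sketch of that lookup, using made-up start times rather than data from a real BL1 file:

```python
import pandas

# Hypothetical cycle start times in hours, indexed by cycle number (toy values).
start_times = pandas.Series({1: 0.0, 2: 1.5, 3: 3.0}, name="time")
start_times.index.name = "cycle"

# Comment timestamps to be assigned to cycles.
comment_times = [0.7, 1.6, 3.2]

# For each comment, take the last cycle whose start time lies before the comment.
cycles = [start_times[t > start_times].index[-1] for t in comment_times]
print(cycles)  # [1, 2, 3]
```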
33 changes: 27 additions & 6 deletions bletl/parsing/blpro.py
@@ -8,7 +8,7 @@
import re
import warnings
import xml.etree.ElementTree
from typing import Optional, Union
from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union

import numpy
import pandas
@@ -20,6 +20,7 @@
BLDParser,
FilterTimeSeries,
FluidicsSource,
IncompatibleFileError,
InvalidLotNumberError,
)

@@ -128,26 +129,29 @@ def _parse_datalines(datalines) -> pandas.DataFrame:
return dfraw


def parse_metadata_data(fp):
def parse_metadata_data(fp) -> Tuple[Dict[str, Any], pandas.DataFrame]:
with open(fp, "r", encoding="utf-8") as f:
lines = f.readlines()

metadata = collections.defaultdict(dict)
datalines = collections.defaultdict(list)
metadata: DefaultDict[str, Any] = collections.defaultdict(dict)
section = None
data_start = None
data_start: Optional[int] = None

for l, line in enumerate(lines):
if line.startswith("="):
# any section header encountered
section = line.strip().strip("=").strip()
if not data_start and section == "data":
data_start = l + 1
elif section is None:
raise IncompatibleFileError("No metadata section header before first setting.")
elif line.startswith("["):
# register the value
key, value = line.split("]")
key = key.strip("[")
metadata[section][key] = value.strip()
if data_start is None:
raise IncompatibleFileError("Section header 'data' not found.")

# standardize the metadata keys
metadata["date_start"] = datetime.datetime.strptime(
@@ -195,7 +199,7 @@ def parse_metadata_data(fp):
f"{fp} contains defects in lines {defect_lines}. Be extra skeptical about the parsed results."
)

return metadata, dfraw[list(dfraw.columns)[:-1]]
return dict(metadata), dfraw[list(dfraw.columns)[:-1]]


def standardize(df):
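One practical effect of returning `dict(metadata)` instead of the `defaultdict` itself: downstream lookups of a missing section now raise `KeyError` rather than silently inserting an empty dict. A small illustration (not repository code):

```python
import collections

metadata = collections.defaultdict(dict)
metadata["general"]["protocol"] = "my-protocol"

print(metadata["typo"])  # defaultdict silently creates and returns {}

plain = dict(metadata)
try:
    plain["another_typo"]
except KeyError:
    print("plain dict raises KeyError on unknown sections")
```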
@@ -258,6 +262,7 @@ def extract_filtersets(metadata):

def extract_comments(dfraw):
ocol_ncol_type = [
("Cycle", "cycle", int),
("Time", "time", float),
("User_Comment", "user_comment", str),
("Sys_Comment", "sys_comment", str),
@@ -315,6 +320,22 @@ def extract_measurements(dfraw):
)
df_M = df_M[~mask]

# Drop filtersets with non-monotonically increasing time
drop_idxs = []
for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]):
t = fsblock["Time"].astype(int).to_numpy()
if any(t[1:] < t[:-1]):
drop_idxs.append(idx)
ndrop = len(drop_idxs)
if ndrop:
for dropC, dropF in drop_idxs:
mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF)
df_M = df_M[~mask]
warnings.warn(
f"Dropped cycle {dropC} filterset {dropF} because of non-monotonically increasing time values.",
UserWarning,
)

# Convert to the expected data types
df = utils.__to_typed_cols__(df_M, ocol_ncol_type)
df = df.set_index(["filterset", "cycle", "well"])
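The headline change of this PR is the block above: within each (Cycle, Filterset) group the raw time column must be non-decreasing, and offending groups are dropped with a warning before the typed conversion. A self-contained sketch of the same check on toy data (column names follow the parser; the values are made up):

```python
import numpy
import pandas

# Toy measurement table in the raw layout; cycle 1 / filterset 2 jumps backwards in time.
df_M = pandas.DataFrame({
    "Cycle":     [1, 1, 1, 1, 2, 2],
    "Filterset": [1, 1, 2, 2, 1, 1],
    "Time":      [10, 11, 12, 9, 20, 21],
})

# Collect (cycle, filterset) blocks whose time values are not monotonically increasing.
drop_idxs = []
for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]):
    t = fsblock["Time"].astype(int).to_numpy()
    if any(t[1:] < t[:-1]):
        drop_idxs.append(idx)

# Drop the offending blocks, as the parser does before the typed conversion.
for dropC, dropF in drop_idxs:
    mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF)
    df_M = df_M[~mask]

print(drop_idxs)  # [(1, 2)]
print(len(df_M))  # 4 rows remain
```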