Merge pull request #47 from JuBiotech/drop-nonmonotonic-time
Drop filtersets with non-monotonic time
bertramgeinitz authored Aug 7, 2023
2 parents 2006f0a + 167a139 commit bad4b7e
Showing 5 changed files with 2,803 additions and 10 deletions.
2 changes: 1 addition & 1 deletion bletl/__init__.py
@@ -20,4 +20,4 @@
NoMeasurementData,
)

__version__ = "1.4.0"
__version__ = "1.4.1"
15 changes: 12 additions & 3 deletions bletl/parsing/bl1.py
@@ -196,8 +196,8 @@ def parse(
metadata = extract_metadata(headerlines)
process_parameters = extract_process_parameters(headerlines)
filtersets = extract_filtersets(headerlines)
comments = extract_comments(data)
references = extract_references(data)
comments = extract_comments(data, references)
measurements = extract_measurements(data)

data = BLData(
@@ -377,16 +377,25 @@ def extract_process_parameters(headerlines):
return process_parameters


def extract_comments(dfraw):
def extract_comments(dfraw: pandas.DataFrame, references: pandas.DataFrame) -> pandas.DataFrame:
"""This adds cycle numbers using timestamps from references."""
ocol_ncol_type = [
("TIME [h]", "time", float),
("COMMENTS", "user_comment", str),
]
df = utils.__to_typed_cols__(dfraw[dfraw["READING"] == "K"], ocol_ncol_type)

# Get the times when each cycle started
start_times = references.reset_index().drop_duplicates("cycle", keep="first").set_index("cycle").time
start_times.loc[1] = 0
# Add cycle numbers based on cycle start times and comment timestamps
df["cycle"] = [start_times[t > start_times].index[-1] for t in df["time"]]

# TODO: automatically separate comments into user/sys
df["sys_comment"] = None
df.index = range(len(df))
return df
# Change column order
return df[["cycle", "time", "user_comment", "sys_comment"]]


def extract_references(dfraw):
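The rewritten `extract_comments` assigns each comment to a cycle by comparing its timestamp against the per-cycle start times taken from the references table (with cycle 1 pinned to t=0). A minimal standalone sketch of that lookup, using made-up start times rather than data from a real BL1 file:

```python
import pandas

# Hypothetical cycle start times in hours, indexed by cycle number (toy values).
start_times = pandas.Series({1: 0.0, 2: 1.5, 3: 3.0}, name="time")
start_times.index.name = "cycle"

# Comment timestamps to be assigned to cycles.
comment_times = [0.7, 1.6, 3.2]

# For each comment, take the last cycle whose start time lies before the comment.
cycles = [start_times[t > start_times].index[-1] for t in comment_times]
print(cycles)  # [1, 2, 3]
```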
33 changes: 27 additions & 6 deletions bletl/parsing/blpro.py
@@ -8,7 +8,7 @@
import re
import warnings
import xml.etree.ElementTree
from typing import Optional, Union
from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union

import numpy
import pandas
@@ -20,6 +20,7 @@
BLDParser,
FilterTimeSeries,
FluidicsSource,
IncompatibleFileError,
InvalidLotNumberError,
)

@@ -128,26 +129,29 @@ def _parse_datalines(datalines) -> pandas.DataFrame:
return dfraw


def parse_metadata_data(fp):
def parse_metadata_data(fp) -> Tuple[Dict[str, Any], pandas.DataFrame]:
with open(fp, "r", encoding="utf-8") as f:
lines = f.readlines()

metadata = collections.defaultdict(dict)
datalines = collections.defaultdict(list)
metadata: DefaultDict[str, Any] = collections.defaultdict(dict)
section = None
data_start = None
data_start: Optional[int] = None

for l, line in enumerate(lines):
if line.startswith("="):
# any section header encountered
section = line.strip().strip("=").strip()
if not data_start and section == "data":
data_start = l + 1
elif section is None:
raise IncompatibleFileError("No metadata section header before first setting.")
elif line.startswith("["):
# register the value
key, value = line.split("]")
key = key.strip("[")
metadata[section][key] = value.strip()
if data_start is None:
raise IncompatibleFileError("Section header 'data' not found.")

# standardize the metadata keys
metadata["date_start"] = datetime.datetime.strptime(
@@ -195,7 +199,7 @@ def parse_metadata_data(fp):
f"{fp} contains defects in lines {defect_lines}. Be extra skeptical about the parsed results."
)

return metadata, dfraw[list(dfraw.columns)[:-1]]
return dict(metadata), dfraw[list(dfraw.columns)[:-1]]


def standardize(df):
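One practical effect of returning `dict(metadata)` instead of the `defaultdict` itself: downstream lookups of a missing section now raise `KeyError` rather than silently inserting an empty dict. A small illustration (not repository code):

```python
import collections

metadata = collections.defaultdict(dict)
metadata["general"]["protocol"] = "my-protocol"

print(metadata["typo"])  # defaultdict silently creates and returns {}

plain = dict(metadata)
try:
    plain["another_typo"]
except KeyError:
    print("plain dict raises KeyError on unknown sections")
```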
@@ -258,6 +262,7 @@ def extract_filtersets(metadata):

def extract_comments(dfraw):
ocol_ncol_type = [
("Cycle", "cycle", int),
("Time", "time", float),
("User_Comment", "user_comment", str),
("Sys_Comment", "sys_comment", str),
@@ -315,6 +320,22 @@ def extract_measurements(dfraw):
)
df_M = df_M[~mask]

# Drop filtersets with non-monotonically increasing time
drop_idxs = []
for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]):
t = fsblock["Time"].astype(int).to_numpy()
if any(t[1:] < t[:-1]):
drop_idxs.append(idx)
ndrop = len(drop_idxs)
if ndrop:
for dropC, dropF in drop_idxs:
mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF)
df_M = df_M[~mask]
warnings.warn(
f"Dropped cycle {dropC} filterset {dropF} because of non-monotonically increasing time values.",
UserWarning,
)

# Convert to the expected data types
df = utils.__to_typed_cols__(df_M, ocol_ncol_type)
df = df.set_index(["filterset", "cycle", "well"])
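The headline change of this PR is the block above: within each (Cycle, Filterset) group the raw time column must be non-decreasing, and offending groups are dropped with a warning before the typed conversion. A self-contained sketch of the same check on toy data (column names follow the parser; the values are made up):

```python
import numpy
import pandas

# Toy measurement table in the raw layout; cycle 1 / filterset 2 jumps backwards in time.
df_M = pandas.DataFrame({
    "Cycle":     [1, 1, 1, 1, 2, 2],
    "Filterset": [1, 1, 2, 2, 1, 1],
    "Time":      [10, 11, 12, 9, 20, 21],
})

# Collect (cycle, filterset) blocks whose time values are not monotonically increasing.
drop_idxs = []
for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]):
    t = fsblock["Time"].astype(int).to_numpy()
    if any(t[1:] < t[:-1]):
        drop_idxs.append(idx)

# Drop the offending blocks, as the parser does before the typed conversion.
for dropC, dropF in drop_idxs:
    mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF)
    df_M = df_M[~mask]

print(drop_idxs)  # [(1, 2)]
print(len(df_M))  # 4 rows remain
```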