From b746830938891d6427d98ae2aca4338f9d385ce7 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 11:43:01 +0100 Subject: [PATCH 1/7] base processing of run dirs on pattern matching for all platforms and remove superfluous .abspath method --- taca/utils/bioinfo_tab.py | 50 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 14e68681..5c16fcf1 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -27,9 +27,13 @@ def __init__(self, value=None): def collect_runs(): """Update command.""" found_runs = {"illumina": [], "element": []} + # Pattern explained: # 6-8Digits_(maybe ST-)AnythingLetterornumberNumber_Number_AorBLetterornumberordash illumina_rundir_re = re.compile("\d{6,8}_[ST-]*\w+\d+_\d+_[AB]?[A-Z0-9\-]+") + # E.g. 20250121_AV242106_B2425434199 + element_rundir_re = re.compile("\d{8}_AV242106_[AB]\d+") + for inst_brand in CONFIG["bioinfo_tab"]["data_dirs"]: for data_dir in CONFIG["bioinfo_tab"]["data_dirs"][inst_brand]: if os.path.exists(data_dir): @@ -37,25 +41,21 @@ def collect_runs(): for run_dir in potential_run_dirs: if os.path.isdir(run_dir): if inst_brand == "illumina" and illumina_rundir_re.match( - os.path.basename(os.path.abspath(run_dir)) + os.path.basename(run_dir) ): found_runs[inst_brand].append(os.path.basename(run_dir)) logger.info(f"Working on {run_dir}") update_statusdb(run_dir, inst_brand) - elif inst_brand == "element": - # Skip no sync dirs, they will be checked below - if run_dir == os.path.join(data_dir, "nosync"): - continue + elif inst_brand == "element" and element_rundir_re.match( + os.path.basename(run_dir) + ): + logger.info(f"Working on {run_dir}") + update_statusdb(run_dir, inst_brand) + elif inst_brand == "ont" and ONT_RUN_PATTERN.match( + os.path.basename(run_dir) + ): logger.info(f"Working on {run_dir}") update_statusdb(run_dir, inst_brand) - elif inst_brand == "ont": - # Skip archived, no_backup, nosync and qc folders - if re.match( - ONT_RUN_PATTERN, - os.path.basename(os.path.abspath(run_dir)), - ): - logger.info(f"Working on {run_dir}") - update_statusdb(run_dir, inst_brand) nosync_data_dir = os.path.join(data_dir, "nosync") potential_nosync_run_dirs = glob.glob( @@ -64,21 +64,26 @@ def collect_runs(): for run_dir in potential_nosync_run_dirs: if os.path.isdir(run_dir): if ( - inst_brand == "illumina" - and illumina_rundir_re.match( - os.path.basename(os.path.abspath(run_dir)) + ( + inst_brand == "illumina" + and illumina_rundir_re.match(os.path.basename(run_dir)) ) - ) or (inst_brand == "element" or inst_brand == "ont"): - # Skip archived dirs - if run_dir == os.path.join(nosync_data_dir, "archived"): - continue + or ( + inst_brand == "element" + and element_rundir_re.match(os.path.basename(run_dir)) + ) + or ( + inst_brand == "ont" + and ONT_RUN_PATTERN.match(os.path.basename(run_dir)) + ) + ): update_statusdb(run_dir, inst_brand) def update_statusdb(run_dir, inst_brand): """Gets status for a project.""" if inst_brand == "illumina": - run_id = os.path.basename(os.path.abspath(run_dir)) + run_id = os.path.basename(run_dir) elif inst_brand == "element": try: aviti_run = Aviti_Run(run_dir, CONFIG) @@ -89,7 +94,6 @@ def update_statusdb(run_dir, inst_brand): # WARNING - Run parameters file not found for ElementRun(), might not be ready yet return elif inst_brand == "ont": - run_dir = os.path.abspath(run_dir) try: ont_run = ONT_run(run_dir) except AssertionError as e: @@ -320,7 +324,7 @@ def get_ss_projects_illumina(run_dir): proj_tree = Tree() lane_pattern = re.compile("^([1-8]{1,2})$") sample_proj_pattern = re.compile("^((P[0-9]{3,5})_[0-9]{3,5})") - run_name = os.path.basename(os.path.abspath(run_dir)) + run_name = os.path.basename(run_dir) run_date = run_name.split("_")[0] if len(run_date) == 6: current_year = "20" + run_date[0:2] From 14955e8746b40fa14b408ce66300b86a6685e285 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 11:43:24 +0100 Subject: [PATCH 2/7] remove unused dict --- taca/utils/bioinfo_tab.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 5c16fcf1..a4c5ef01 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -26,7 +26,6 @@ def __init__(self, value=None): def collect_runs(): """Update command.""" - found_runs = {"illumina": [], "element": []} # Pattern explained: # 6-8Digits_(maybe ST-)AnythingLetterornumberNumber_Number_AorBLetterornumberordash @@ -43,7 +42,6 @@ def collect_runs(): if inst_brand == "illumina" and illumina_rundir_re.match( os.path.basename(run_dir) ): - found_runs[inst_brand].append(os.path.basename(run_dir)) logger.info(f"Working on {run_dir}") update_statusdb(run_dir, inst_brand) elif inst_brand == "element" and element_rundir_re.match( From 115849a743ea9878ff0dc6e23c4a6086592b92ca Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 11:52:36 +0100 Subject: [PATCH 3/7] simplify logic --- taca/utils/bioinfo_tab.py | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index a4c5ef01..77b3669c 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -37,29 +37,9 @@ def collect_runs(): for data_dir in CONFIG["bioinfo_tab"]["data_dirs"][inst_brand]: if os.path.exists(data_dir): potential_run_dirs = glob.glob(os.path.join(data_dir, "*")) - for run_dir in potential_run_dirs: - if os.path.isdir(run_dir): - if inst_brand == "illumina" and illumina_rundir_re.match( - os.path.basename(run_dir) - ): - logger.info(f"Working on {run_dir}") - update_statusdb(run_dir, inst_brand) - elif inst_brand == "element" and element_rundir_re.match( - os.path.basename(run_dir) - ): - logger.info(f"Working on {run_dir}") - update_statusdb(run_dir, inst_brand) - elif inst_brand == "ont" and ONT_RUN_PATTERN.match( - os.path.basename(run_dir) - ): - logger.info(f"Working on {run_dir}") - update_statusdb(run_dir, inst_brand) + potential_run_dirs += glob.glob(os.path.join(data_dir, "nosync", "*")) - nosync_data_dir = os.path.join(data_dir, "nosync") - potential_nosync_run_dirs = glob.glob( - os.path.join(nosync_data_dir, "*") - ) - for run_dir in potential_nosync_run_dirs: + for run_dir in potential_run_dirs: if os.path.isdir(run_dir): if ( ( @@ -75,6 +55,7 @@ def collect_runs(): and ONT_RUN_PATTERN.match(os.path.basename(run_dir)) ) ): + logger.info(f"Working on {run_dir}") update_statusdb(run_dir, inst_brand) From 060516033abd8644df9febb4f8cdc3c513b17198 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 11:55:46 +0100 Subject: [PATCH 4/7] vlog --- VERSIONLOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 9343f670..dbc2522f 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,6 +1,10 @@ # TACA Version Log -## 20241216.1 +## 20250122.1 + +Improve the way TACA identifies run dirs in the "bioinfo_deliveries --update" command (bioinfo_tab.py). + +## 20241216.2 Do not run ToulligQC if its output directory can be found. From 8b725296f85cc5d55fcf486db5f6e5ab434f3ae5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 11:59:23 +0100 Subject: [PATCH 5/7] ruff format --- taca/cleanup/cleanup.py | 8 ++++---- taca/element/Element_Runs.py | 8 ++++---- taca/nanopore/ONT_run_classes.py | 12 ++++++------ taca/utils/misc.py | 2 +- taca/utils/statusdb.py | 3 +-- taca/utils/transfer.py | 2 +- tests/nanopore/test_ONT_run_classes.py | 2 +- tests/nanopore/test_instrument_transfer.py | 4 ++-- 8 files changed, 20 insertions(+), 21 deletions(-) diff --git a/taca/cleanup/cleanup.py b/taca/cleanup/cleanup.py index e6958205..799fa8c6 100644 --- a/taca/cleanup/cleanup.py +++ b/taca/cleanup/cleanup.py @@ -571,13 +571,13 @@ def _def_get_size_unit(s): gb = mb * 1000 tb = gb * 1000 if s > tb: - s = f"~{int(s/tb)}tb" + s = f"~{int(s / tb)}tb" elif s > gb: - s = f"~{int(s/gb)}gb" + s = f"~{int(s / gb)}gb" elif s > mb: - s = f"~{int(s/mb)}mb" + s = f"~{int(s / mb)}mb" elif s > kb: - s = f"~{int(s/kb)}kb" + s = f"~{int(s / kb)}kb" elif s > 0: s = f"~{int(s)}b" return str(s) diff --git a/taca/element/Element_Runs.py b/taca/element/Element_Runs.py index 5e1c2716..84dbf3ca 100644 --- a/taca/element/Element_Runs.py +++ b/taca/element/Element_Runs.py @@ -437,9 +437,9 @@ def make_demux_manifests( # Get '[SAMPLES]' section split_contents = manifest_contents.split("[SAMPLES]") - assert ( - len(split_contents) == 2 - ), f"Could not split sample rows out of manifest {manifest_contents}" + assert len(split_contents) == 2, ( + f"Could not split sample rows out of manifest {manifest_contents}" + ) sample_section = split_contents[1].strip().split("\n") # Split into header and rows @@ -560,7 +560,7 @@ def make_demux_manifests( "[RUNVALUES]", "KeyName, Value", f"manifest_file, {file_name}", - f"manifest_group, {n+1}/{len(grouped_df)}", + f"manifest_group, {n + 1}/{len(grouped_df)}", f"built_from, {manifest_to_split}", ] ) diff --git a/taca/nanopore/ONT_run_classes.py b/taca/nanopore/ONT_run_classes.py index 808417c2..f26904e2 100644 --- a/taca/nanopore/ONT_run_classes.py +++ b/taca/nanopore/ONT_run_classes.py @@ -37,9 +37,9 @@ def __init__(self, run_abspath: str): None # This will be defined upon instantiation of a child class ) - assert re.match( - ONT_RUN_PATTERN, self.run_name - ), f"Run {self.run_name} doesn't look like a run dir" + assert re.match(ONT_RUN_PATTERN, self.run_name), ( + f"Run {self.run_name} doesn't look like a run dir" + ) # Parse MinKNOW sample and experiment name with open(self.get_file("/run_path.txt")) as stream: @@ -143,9 +143,9 @@ def touch_db_entry(self): pore_count_history_file = os.path.join( self.run_abspath, "pore_count_history.csv" ) - assert os.path.isfile( - pore_count_history_file - ), f"Couldn't find {pore_count_history_file}" + assert os.path.isfile(pore_count_history_file), ( + f"Couldn't find {pore_count_history_file}" + ) self.db.create_ongoing_run(self, run_path_file, pore_count_history_file) logger.info( diff --git a/taca/utils/misc.py b/taca/utils/misc.py index 8b443014..3dc52f91 100755 --- a/taca/utils/misc.py +++ b/taca/utils/misc.py @@ -176,7 +176,7 @@ def query_yes_no(question, default="yes", force=False): elif choice in valid: return valid[choice] else: - sys.stdout.write('Please respond with "yes" or "no" ' '(or "y" or "n").\n') + sys.stdout.write('Please respond with "yes" or "no" (or "y" or "n").\n') def return_unique(seq): diff --git a/taca/utils/statusdb.py b/taca/utils/statusdb.py index a2920550..85635864 100644 --- a/taca/utils/statusdb.py +++ b/taca/utils/statusdb.py @@ -225,8 +225,7 @@ def merge_dicts(d1, d2): pass # same leaf value else: logger.debug( - f"Values for key {key} in d1 and d2 differ, " - "using the value of d1" + f"Values for key {key} in d1 and d2 differ, using the value of d1" ) else: d1[key] = d2[key] diff --git a/taca/utils/transfer.py b/taca/utils/transfer.py index 8a5bf311..8456912d 100644 --- a/taca/utils/transfer.py +++ b/taca/utils/transfer.py @@ -269,7 +269,7 @@ def transfer(self): # If we are not overwriting, return False if not self.overwrite: logger.debug( - f'target "{self.dest_path}" exists and will not be ' "overwritten" + f'target "{self.dest_path}" exists and will not be overwritten' ) return False # If the target is a mount, let's not mess with it diff --git a/tests/nanopore/test_ONT_run_classes.py b/tests/nanopore/test_ONT_run_classes.py index 6e9d1d84..305f34b5 100644 --- a/tests/nanopore/test_ONT_run_classes.py +++ b/tests/nanopore/test_ONT_run_classes.py @@ -171,7 +171,7 @@ def create_ONT_run_dir( "unknown_positive", "zero", ]: - f.write(f"{state},{i},{i*100}\n") + f.write(f"{state},{i},{i * 100}\n") if sync_finished: open(f"{run_path}/.sync_finished", "w").close() diff --git a/tests/nanopore/test_instrument_transfer.py b/tests/nanopore/test_instrument_transfer.py index ccede9fb..1c1270bf 100644 --- a/tests/nanopore/test_instrument_transfer.py +++ b/tests/nanopore/test_instrument_transfer.py @@ -398,7 +398,7 @@ def test_dump_pore_count_history(setup_test_fixture): # Nothing to add, no file tmp = tempfile.TemporaryDirectory() - run_path = tmp.name + f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST','FLG')}" + run_path = tmp.name + f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST', 'FLG')}" os.makedirs(run_path) new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts) assert open(new_file).read() == "" @@ -406,7 +406,7 @@ def test_dump_pore_count_history(setup_test_fixture): # Nothing to add, file is present tmp = tempfile.TemporaryDirectory() - run_path = tmp.name + f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST','FLG')}" + run_path = tmp.name + f"/experiment/sample/{DUMMY_RUN_NAME.replace('TEST', 'FLG')}" os.makedirs(run_path) open(run_path + "/pore_count_history.csv", "w").write("test") new_file = instrument_transfer.dump_pore_count_history(run_path, pore_counts) From d1d21f6a8318c1d3f7c9ca1b61d0461bda5da1d2 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 12:01:35 +0100 Subject: [PATCH 6/7] vlog --- VERSIONLOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/VERSIONLOG.md b/VERSIONLOG.md index 9343f670..a46ae070 100644 --- a/VERSIONLOG.md +++ b/VERSIONLOG.md @@ -1,6 +1,10 @@ # TACA Version Log -## 20241216.1 +## 20250122.1 + +Ruff formatting. + +## 20241216.2 Do not run ToulligQC if its output directory can be found. From 0fc69ca63efd82d1edd38f192fec619c97b215de Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 22 Jan 2025 13:30:43 +0100 Subject: [PATCH 7/7] use abspath for potential manual rundir input --- taca/utils/bioinfo_tab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taca/utils/bioinfo_tab.py b/taca/utils/bioinfo_tab.py index 77b3669c..14d6613d 100644 --- a/taca/utils/bioinfo_tab.py +++ b/taca/utils/bioinfo_tab.py @@ -62,7 +62,7 @@ def collect_runs(): def update_statusdb(run_dir, inst_brand): """Gets status for a project.""" if inst_brand == "illumina": - run_id = os.path.basename(run_dir) + run_id = os.path.basename(os.path.abspath(run_dir)) elif inst_brand == "element": try: aviti_run = Aviti_Run(run_dir, CONFIG)