From 1de8a717f20336e689d5db3901d35ca6d635dc45 Mon Sep 17 00:00:00 2001 From: David McManamon Date: Thu, 12 Sep 2024 09:20:01 -0400 Subject: [PATCH 1/3] Add ONT flowcell to the output summary.csv file --- scripts/ont_stats.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/scripts/ont_stats.py b/scripts/ont_stats.py index aeb384c..81c2be5 100644 --- a/scripts/ont_stats.py +++ b/scripts/ont_stats.py @@ -4,6 +4,7 @@ import glob import os from collections import OrderedDict +import re # TODO get barcode info from lims # check if the run is pooled @@ -14,7 +15,7 @@ def if_pooled(sequencing_summary_df): return pooled # get stats metric if the run is not pooled -def get_read_length_and_summary(sequencing_summary_df): +def get_read_length_and_summary(sequencing_summary_df, flowcell): read_length = sequencing_summary_df[sequencing_summary_df["passes_filtering"]]["sequence_length_template"].tolist() if len(read_length) != 0: read_length.sort(reverse = True) @@ -30,10 +31,10 @@ def get_read_length_and_summary(sequencing_summary_df): median = 0 N50_value = 0 N50 = 0 - return(len(read_length), N50_value * 2 / 1000000000, N50, median) + return(len(read_length), N50_value * 2 / 1000000000, N50, median, flowcell) # get stats metric if the run is pooled -def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name): +def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name, flowcell): sample_dict = {} samples = sequencing_summary_df["barcode_arrangement"].unique() for sample in samples: @@ -45,13 +46,21 @@ def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name): sample_dict[sample_sub] = get_read_length_and_summary(sample_df) return sample_dict +def extract_flowcell(text): + # Regular expression to match the characters after 'sequencing_summary_' and before the next '_' + match = re.search(r'sequencing_summary_([^_]+)', text) + if match: + return match.group(1) + else: + return None + def write_to_csv(sample_dict): file_name = "summary.csv" print("Writing stats file: " + file_name) with open(file_name,'w') as file: - file.write("sample_id, Reads, Bases, N50, Meidan Read Length\n") + file.write("sample_id, Reads, Bases, N50, Median Read Length, Flowcell\n") for key, value in sample_dict.items(): - file.write("{}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3])) + file.write("{}, {}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3], value[4])) if __name__ == '__main__': # Usage: python ont_stats.py [project_directory] @@ -70,16 +79,19 @@ def write_to_csv(sample_dict): file_count = 0 for i in file: file_count += 1 + flowcell = extract_flowcell(i) + print("Processing file: " + i + " from flowcell: " + flowcell) summary_matrix = pd.read_csv(i, delimiter = "\t") pooled = if_pooled(summary_matrix) # give different sample name for multi runs on one flow cell if file_count != 1: sample = sample + "_" + str(file_count) if pooled: - sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample) + sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample, flowcell) sample_dict.update(sample_dict_sub) else: - sample_dict[sample] = get_read_length_and_summary(summary_matrix) + sample_dict[sample] = get_read_length_and_summary(summary_matrix, flowcell) + print(sample_dict) write_to_csv(sample_dict) print("ONT stats complete for: " + project_directory) From 4a69d464a6167d58dc2cecfa89b74480961cec0a Mon Sep 17 00:00:00 2001 From: David McManamon Date: Thu, 26 Sep 2024 13:32:59 -0400 Subject: [PATCH 2/3] Store ONT stats to LIMS --- scripts/ont_stats.py | 56 +++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/scripts/ont_stats.py b/scripts/ont_stats.py index 81c2be5..8fe60d9 100644 --- a/scripts/ont_stats.py +++ b/scripts/ont_stats.py @@ -1,4 +1,5 @@ import pandas as pd +import requests import statistics import sys import glob @@ -15,7 +16,7 @@ def if_pooled(sequencing_summary_df): return pooled # get stats metric if the run is not pooled -def get_read_length_and_summary(sequencing_summary_df, flowcell): +def get_read_length_and_summary(sequencing_summary_df, flowcell, position): read_length = sequencing_summary_df[sequencing_summary_df["passes_filtering"]]["sequence_length_template"].tolist() if len(read_length) != 0: read_length.sort(reverse = True) @@ -31,19 +32,19 @@ def get_read_length_and_summary(sequencing_summary_df, flowcell): median = 0 N50_value = 0 N50 = 0 - return(len(read_length), N50_value * 2 / 1000000000, N50, median, flowcell) + return(len(read_length), N50_value * 2 / 1000000000, N50, median, flowcell, position) # get stats metric if the run is pooled -def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name, flowcell): +def get_read_length_and_summary_pooled(sequencing_summary_df, sample_name, flowcell, position): sample_dict = {} samples = sequencing_summary_df["barcode_arrangement"].unique() for sample in samples: sample_df = sequencing_summary_df.loc[sequencing_summary_df['barcode_arrangement'] == sample] sample_sub = sample_name + "_" + sample - stats = get_read_length_and_summary(sample_df) + stats = get_read_length_and_summary(sample_df, flowcell, position) # only record barcodes with more than 10000 reads if stats[0] > 10000: - sample_dict[sample_sub] = get_read_length_and_summary(sample_df) + sample_dict[sample_sub] = get_read_length_and_summary(sample_df, flowcell, position) return sample_dict def extract_flowcell(text): @@ -54,13 +55,12 @@ def extract_flowcell(text): else: return None -def write_to_csv(sample_dict): - file_name = "summary.csv" +def write_to_csv(sample_dict, file_name): print("Writing stats file: " + file_name) with open(file_name,'w') as file: - file.write("sample_id, Reads, Bases, N50, Median Read Length, Flowcell\n") + file.write("sample_id, Reads, Bases, N50, Median Read Length, Flowcell, Position\n") for key, value in sample_dict.items(): - file.write("{}, {}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3], value[4])) + file.write("{}, {}, {}, {}, {}, {}, {}\n".format(key, value[0], value[1], value[2], value[3], value[4], value[5])) if __name__ == '__main__': # Usage: python ont_stats.py [project_directory] @@ -79,19 +79,47 @@ def write_to_csv(sample_dict): file_count = 0 for i in file: file_count += 1 + position = i.split("/")[-2].split("_")[2] flowcell = extract_flowcell(i) - print("Processing file: " + i + " from flowcell: " + flowcell) + print("Processing file: " + i + " from flowcell: " + flowcell + " at position:" + position) summary_matrix = pd.read_csv(i, delimiter = "\t") pooled = if_pooled(summary_matrix) # give different sample name for multi runs on one flow cell if file_count != 1: sample = sample + "_" + str(file_count) if pooled: - sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample, flowcell) + sample_dict_sub = get_read_length_and_summary_pooled(summary_matrix, sample, flowcell, position) sample_dict.update(sample_dict_sub) else: - sample_dict[sample] = get_read_length_and_summary(summary_matrix, flowcell) + sample_dict[sample] = get_read_length_and_summary(summary_matrix, flowcell, position) print(sample_dict) - write_to_csv(sample_dict) - print("ONT stats complete for: " + project_directory) + print(sample_dict) + write_to_csv(sample_dict, "summary.csv") + print("ONT stats .csv complete for: " + project_directory) + + # List of parameter names corresponding to the values skipping columns "estimatedCoverage", "bamCoverage", "sequencerName" + parameter_names = ["reads", "bases", "N50", "medianReadLength", "flowcell", "sequencerPosition", "igoId"] + + # Convert initial dictionary to a nested dictionary with parameter names + converted_sample_dict = {} + for key, values in sample_dict.items(): + # Create a nested dictionary by zipping parameter names and values + converted_sample_dict[key] = dict(zip(parameter_names, values.append(key))) + print(converted_sample_dict) + + # Write to LIMS endpoint with a GET: + # /LimsRest/updateLimsSampleLevelSequencingQcONT?igoId=04540_U_26_1_1_1_1_1&flowcell=PAY61078&reads=19775442&bases=9103016668&N50=16508&medianReadLength=766&estimatedCoverage=0&bamCoverage=0&sequencerPosition=1A&sequencerName=zeppelin + LIMS_ENDPOINT="https://igo-lims02.mskcc.org:8443/LimsRest/updateLimsSampleLevelSequencingQcONT" + for sample_id, params in converted_sample_dict.items(): + # Send GET request for each set of parameters + print("Sending LIMS get request for: " + params) + response = requests.get(LIMS_ENDPOINT, params=params, verify=False) + + # Check the response status and print the output + if response.status_code == 200: + print(f"Request for {sample_id} successful!") + print("Response Data:", response.json()) + else: + print(f"Request for {sample_id} failed with status code {response.status_code}") + print("Error details:", response.text) \ No newline at end of file From 023fcebe51218027db22348c39321979a754176c Mon Sep 17 00:00:00 2001 From: David McManamon Date: Thu, 26 Sep 2024 14:08:44 -0400 Subject: [PATCH 3/3] change how igoId is appended to the array --- scripts/ont_stats.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/ont_stats.py b/scripts/ont_stats.py index 8fe60d9..37d743a 100644 --- a/scripts/ont_stats.py +++ b/scripts/ont_stats.py @@ -105,7 +105,8 @@ def write_to_csv(sample_dict, file_name): converted_sample_dict = {} for key, values in sample_dict.items(): # Create a nested dictionary by zipping parameter names and values - converted_sample_dict[key] = dict(zip(parameter_names, values.append(key))) + values = values + (key,) + converted_sample_dict[key] = dict(zip(parameter_names, values)) print(converted_sample_dict) # Write to LIMS endpoint with a GET: