From c37fe98c19878d1cac2d0e223932e049c08dddab Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:22:41 +0100 Subject: [PATCH 1/4] fix vcf_collect when fusioninspector output is empty but fusionreport is not --- bin/vcf_collect.py | 104 +++++++++++++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 27 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 2c25574f..9da5f5c4 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -43,7 +43,6 @@ def vcf_collect( .reset_index() ) hgnc_df = build_hgnc_dataframe(hgnc) - df_symbol = merged_df[merged_df["Left_ensembl_gene_id"].isna()] df_not_symbol = merged_df[merged_df["Left_ensembl_gene_id"].notna()] @@ -66,7 +65,9 @@ def vcf_collect( gtf_df = build_gtf_dataframe(gtf) all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") - all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) + all_df[["PosA", "orig_start", "orig_end"]] = ( + all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) + ) all_df = all_df[ ((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])) @@ -76,7 +77,9 @@ def vcf_collect( all_df.replace("", np.nan, inplace=True) all_df = all_df.drop_duplicates() - all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan) + all_df[["exon_number", "transcript_version"]] = all_df[ + ["exon_number", "transcript_version"] + ].replace(0, np.nan) # Fill non-empty values within each group for 'exon_number' and 'transcript_version' all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") @@ -115,9 +118,14 @@ def vcf_collect( "annots", ] ].drop_duplicates() + all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype('str') all_df = all_df.merge(gtf_df, how="left", 
left_on="CDS_RIGHT_ID", right_on="Transcript_id") - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0) - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna( + 0 + ) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype( + int + ) all_df = all_df[ ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])) | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) @@ -126,7 +134,9 @@ def vcf_collect( all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan) all_df = all_df.replace("", np.nan) - all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan) + all_df[["exon_number", "transcript_version"]] = all_df[ + ["exon_number", "transcript_version"] + ].replace(0, np.nan) # Fill non-empty values within each group for 'exon_number' and 'transcript_version' all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") @@ -212,7 +222,9 @@ def parse_args(argv=None): type=Path, help="HGNC database.", ) - parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample") + parser.add_argument( + "--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample" + ) parser.add_argument( "--out", metavar="OUT", @@ -272,15 +284,41 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ df = pd.read_csv(file, sep="\t") df = df.rename(columns={"#FusionName": "FUSION"}) - df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) - df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) - df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", 
expand=True) - df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) - df["annots"] = ( - df["annots"] - .apply(convert_to_list) - .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "") - ) + if not (df.empty): + df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) + df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) + df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) + df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) + df["annots"] = ( + df["annots"] + .apply(convert_to_list) + .apply( + lambda x: ",".join(map(str, x)) + if isinstance(x, list) + else str(x) + if pd.notna(x) + else "" + ) + ) + else: + for i in [ + "ChromosomeA", + "Strand1", + "ChromosomeB", + "Strand2", + "LeftGeneName", + "Left_ensembl_gene_id", + "RightGeneName", + "Right_ensembl_gene_id", + "annots", + ]: + df[i] = "" + for j in [ + "PosA", + "PosB", + ]: + df[j] = np.nan + return df.set_index(["FUSION"]) @@ -315,8 +353,8 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: """ with open(fusionreport_file) as f: from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line] - expression = from_html[0].split('], "tool')[0] - fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)) + expression = ast.literal_eval(from_html[0].split('], "tool')[0]) + fusion_report = pd.DataFrame.from_dict({k: [v] for k, v in expression.items()}) if not "arriba" in fusion_report.columns: fusion_report["arriba"] = "" if not "fusioncatcher" in fusion_report.columns: @@ -339,9 +377,9 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x)) fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True) - 
return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index( - ["FUSION"] - ) + return fusion_report[ + ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"] + ].set_index(["FUSION"]) def read_fusionreport_csv(file: str) -> pd.DataFrame: @@ -350,7 +388,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: for column in columns_to_iterate: if column not in df.columns: df[column] = "" - df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str") + df[["starfusion", "arriba", "fusioncatcher"]] = df[ + ["starfusion", "arriba", "fusioncatcher"] + ].astype("str") for index, row in df.iterrows(): for column in columns_to_iterate: cell_value = row[column] @@ -378,7 +418,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True) df = df.set_index("Fusion") df.to_csv("tmp.csv") - return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]] + return df[ + ["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"] + ] def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: @@ -405,7 +447,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str) df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str) - df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str) + df["Right_transcript_version"] = ( + df["Right_transcript_version"].fillna(0).astype(int).astype(str) + ) df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str) df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str) df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan") @@ -432,7 +476,9 @@ def 
column_manipulation(df: pd.DataFrame) -> pd.DataFrame: f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};" f"ANNOTATIONS={row['annots']}" ) - df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + df.loc[ + index, "Sample" + ] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" return df @@ -477,7 +523,9 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame: """ df = pd.read_csv(file, sep="\t") df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) - df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split( + ",", expand=True + ) return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]] @@ -491,7 +539,9 @@ def main(argv=None): or not args.fusionreport_csv or not args.hgnc ): - logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") + logger.error( + f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!" 
+ ) sys.exit(2) vcf_collect( args.fusioninspector, From a6659dac892a335d5bfd4f2b810b4fabeef9b48b Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:24:24 +0100 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ef345a3..d592cefc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,11 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Update to nf-tools 2.11.1 +- Update to nf-tools 2.11.1 [#457](https://github.com/nf-core/rnafusion/pull/457) ### Fixed -- fix VCF_COLLECT handling when a tool is absent from FUSIONREPORT report +- fix VCF_COLLECT handling when a tool is absent from FUSIONREPORT report [#458](https://github.com/nf-core/rnafusion/pull/458) +- fix VCF_COLLECT when fusioninspector output is empty but fusionreport is not ### Removed From b67a751bbffcef5a2f80c02f9ce4c073e8a4aa0d Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:26:57 +0100 Subject: [PATCH 3/4] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d592cefc..6468aff6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - fix VCF_COLLECT handling when a tool is absent from FUSIONREPORT report [#458](https://github.com/nf-core/rnafusion/pull/458) -- fix VCF_COLLECT when fusioninspector output is empty but fusionreport is not +- fix VCF_COLLECT when fusioninspector output is empty but fusionreport is not [#465](https://github.com/nf-core/rnafusion/pull/465) ### Removed From 26e492c3140ac3f6e43cd3dbd2ff332611cb511a Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:35:18
+0100 Subject: [PATCH 4/4] black --- bin/vcf_collect.py | 64 ++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 9da5f5c4..7e988c1e 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -65,9 +65,7 @@ def vcf_collect( gtf_df = build_gtf_dataframe(gtf) all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") - all_df[["PosA", "orig_start", "orig_end"]] = ( - all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) - ) + all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) all_df = all_df[ ((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])) @@ -77,9 +75,7 @@ def vcf_collect( all_df.replace("", np.nan, inplace=True) all_df = all_df.drop_duplicates() - all_df[["exon_number", "transcript_version"]] = all_df[ - ["exon_number", "transcript_version"] - ].replace(0, np.nan) + all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan) # Fill non-empty values within each group for 'exon_number' and 'transcript_version' all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") @@ -118,14 +114,10 @@ def vcf_collect( "annots", ] ].drop_duplicates() - all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype('str') + all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str") all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna( - 0 - ) - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype( - int - ) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0) + all_df[["PosB", "orig_start", "orig_end"]] = 
all_df[["PosB", "orig_start", "orig_end"]].astype(int) all_df = all_df[ ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])) | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) @@ -134,9 +126,7 @@ def vcf_collect( all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan) all_df = all_df.replace("", np.nan) - all_df[["exon_number", "transcript_version"]] = all_df[ - ["exon_number", "transcript_version"] - ].replace(0, np.nan) + all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan) # Fill non-empty values within each group for 'exon_number' and 'transcript_version' all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") @@ -222,9 +212,7 @@ def parse_args(argv=None): type=Path, help="HGNC database.", ) - parser.add_argument( - "--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample" - ) + parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample") parser.add_argument( "--out", metavar="OUT", @@ -292,13 +280,7 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: df["annots"] = ( df["annots"] .apply(convert_to_list) - .apply( - lambda x: ",".join(map(str, x)) - if isinstance(x, list) - else str(x) - if pd.notna(x) - else "" - ) + .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "") ) else: for i in [ @@ -377,9 +359,9 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x)) fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True) - return fusion_report[ - ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"] - ].set_index(["FUSION"]) + return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", 
"FOUND_IN"]].set_index( + ["FUSION"] + ) def read_fusionreport_csv(file: str) -> pd.DataFrame: @@ -388,9 +370,7 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: for column in columns_to_iterate: if column not in df.columns: df[column] = "" - df[["starfusion", "arriba", "fusioncatcher"]] = df[ - ["starfusion", "arriba", "fusioncatcher"] - ].astype("str") + df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str") for index, row in df.iterrows(): for column in columns_to_iterate: cell_value = row[column] @@ -418,9 +398,7 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True) df = df.set_index("Fusion") df.to_csv("tmp.csv") - return df[ - ["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"] - ] + return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]] def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: @@ -447,9 +425,7 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str) df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str) - df["Right_transcript_version"] = ( - df["Right_transcript_version"].fillna(0).astype(int).astype(str) - ) + df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str) df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str) df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str) df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan") @@ -476,9 +452,7 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};" f"ANNOTATIONS={row['annots']}" ) - df.loc[ - index, "Sample" - ] = 
f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" return df @@ -523,9 +497,7 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame: """ df = pd.read_csv(file, sep="\t") df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) - df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split( - ",", expand=True - ) + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]] @@ -539,9 +511,7 @@ def main(argv=None): or not args.fusionreport_csv or not args.hgnc ): - logger.error( - f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!" - ) + logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) vcf_collect( args.fusioninspector,