From 99aebe4d194c9cdc3c7eca67a79181b817c7dcb1 Mon Sep 17 00:00:00 2001 From: josephrich98 Date: Sun, 13 Oct 2024 15:50:52 -0700 Subject: [PATCH] simplified gget_mutate and ported over gget_mutate code to kvar --- docs/src/en/mutate.md | 112 +--------- gget/gget_mutate.py | 380 +++----------------------------- gget/main.py | 117 +--------- tests/fixtures/test_mutate.json | 112 +++------- 4 files changed, 64 insertions(+), 657 deletions(-) diff --git a/docs/src/en/mutate.md b/docs/src/en/mutate.md index b768bcc9..316a1b78 100644 --- a/docs/src/en/mutate.md +++ b/docs/src/en/mutate.md @@ -5,6 +5,8 @@ Return format: Saves mutated sequences in FASTA format (or returns a list contai This module was written by [Joseph Rich](https://github.com/josephrich98). +** Update: The more complex functionality of gget mutate has been ported to https://github.com/pachterlab/kvar. kvar expands on this functionality in the context of screening for variants/mutations in sequencing data. If this sounds interesting to you, please check it out! ** + **Positional argument** `sequences` Path to the FASTA file containing the sequences to be mutated, e.g., 'path/to/seqs.fa'. @@ -53,60 +55,12 @@ Name of the column containing the IDs of the sequences to be mutated in `mutatio `-mic` `--mut_id_column` Name of the column containing the IDs of each mutation in `mutations`. Default: Same as `mut_column`. - -`-gtf` `--gtf` -Path to a .gtf file. When providing a genome fasta file as input for 'sequences', you can provide a .gtf file here and the input sequences will be defined according to the transcript boundaries, e.g. 'path/to/genome_annotation.gtf'. Default: None - -`-gtic` `--gtf_transcript_id_column` -Column name in the input `mutations` file containing the transcript ID. In this case, column `seq_id_column` should contain the chromosome number. -Required when `gtf` is provided. Default: None **Optional mutant sequence generation/filtering arguments** `-k` `--k` Length of sequences flanking the mutation. Default: 30. If k > total length of the sequence, the entire sequence will be kept. -`-msl` `--min_seq_len` -Minimum length of the mutant output sequence, e.g. 100. Mutant sequences smaller than this will be dropped. Default: None - -`-ma` `--max_ambiguous` -Maximum number of 'N' (or 'n') characters allowed in the output sequence, e.g. 10. Default: None (no ambiguous character filter will be applied) - -**Optional mutant sequence generation/filtering flags** -`-ofr` `--optimize_flanking_regions` -Removes nucleotides from either end of the mutant sequence to ensure (when possible) that the mutant sequence does not contain any k-mers also found in the wildtype/input sequence. - -`-rswk` `--remove_seqs_with_wt_kmers` -Removes output sequences where at least one k-mer is also present in the wildtype/input sequence in the same region. -When used with `--optimize_flanking_regions`, only sequences for which a wildtype k-mer is still present after optimization will be removed. - -`-mio` `--merge_identical_off` -Do not merge identical mutant sequences in the output (by default, identical sequences will be merged by concatenating the sequence headers for all identical sequences). - -**Optional arguments to generate additional output** -This output is activated using the `--update_df` flag and will be stored in a copy of the `mutations` DataFrame. - -`-udf_o` `--update_df_out` -Path to output csv file containing the updated DataFrame, e.g. 'path/to/mutations_updated.csv'. Only valid when used with `--update_df`. -Default: None -> the new csv file will be saved in the same directory as the `mutations` DataFrame with appendix '_updated' - -`-ts` `--translate_start` -(int or str) The position in the input nucleotide sequence to start translating, e.g. 5. If a string is provided, it should correspond to a column name in `mutations` containing the open reading frame start positions for each sequence/mutation. Only valid when used with `--translate`. -Default: translates from the beginning of each sequence - -`-te` `--translate_end` -(int or str) The position in the input nucleotide sequence to end translating, e.g. 35. If a string is provided, it should correspond to a column name in `mutations` containing the open reading frame end positions for each sequence/mutation. Only valid when used with `--translate`. -Default: translates until the end of each sequence - -**Optional flags to modify additional output** -`-udf` `--update_df` -Updates the input `mutations` DataFrame to include additional columns with the mutation type, wildtype nucleotide sequence, and mutant nucleotide sequence (only valid if `mutations` is a .csv or .tsv file). - -`-sfs` `--store_full_sequences` -Includes the complete wildtype and mutant sequences in the updated `mutations` DataFrame (not just the sub-sequence with k-length flanks). Only valid when used with `--update_df`. - -`-tr` `--translate` -Adds additional columns to the updated `mutations` DataFrame containing the wildtype and mutant amino acid sequences. Only valid when used with `--store_full_sequences`. **Optional general arguments** `-o` `--out` @@ -160,64 +114,4 @@ gget.mutate(["ATCGCTAAGCT", "TAGCTA"], "c.1_3inv", k=3) → Returns ['CTAGCT', 'GATCTA']. -

- -**Add mutations to an entire genome with extended output** -Main input: -- mutation information as a `mutations` CSV (by having `seq_id_column` contain chromosome information, and `mut_column` contain mutation information with respect to genome coordinates) -- the genome as the `sequences` file - -Since we are passing the path to a gtf file to the `gtf` argument, transcript boundaries will be respected (the genome will be split into transcripts). `gtf_transcript_id_column` specifies the name of the column in `mutations` containing the transcript IDs corresponding to the transcript IDs in the `gtf` file. - -The `optimize_flanking_regions` argument maximizes the length of the resulting mutation-containing sequences while maintaining specificity (no wildtype k-mer will be retained). - -`update_df` activates the creation of a new CSV file with updated information about each input and output sequence. This new CSV file will be saved as `update_df_out`. Since `store_full_sequences` is activated, this new CSV file will not only contain the output sequences (restricted in size by flanking regiong of size `k`), but also the complete input and output sequences. This allows us to observe the mutation in the context of the entire sequence. Lastly, we are also adding the translated versions of the complete sequences by adding the with the `translate` flag, so we can observe how the resulting amino acid sequence is changed. The `translate_start` and `translate_end` arguments specify the names of the columns in `mutations` that contain the start and end positions of the open reading frame (start and end positions for translating the nucleotide sequence to an amino acid sequence), respectively. - -```bash -gget mutate \ - -m mutations_input.csv \ - -o mut_fasta.fa \ - -k 4 \ - -sic Chromosome \ - -mic Mutation \ - -gtf genome_annotation.gtf \ - -gtic Ensembl_Transcript_ID \ - -ofr \ - -update_df \ - -udf_o mutations_updated.csv \ - -sfs \ - -tr \ - -ts Translate_Start \ - -te Translate_End \ - genome_reference.fa -``` -```python -# Python -gget.mutate(sequences="genome_reference.fa", mutations="mutations_input.csv", out="mut_fasta.fa", k=4, seq_id_column="Chromosome", mut_column="Mutation", gtf="genome_annotation.gtf", gtf_transcript_id_column="Ensembl_Transcript_ID", optimize_flanking_regions=True, update_df=True, update_df_out="mutations_updated.csv", store_full_sequences=True, translate=True, translate_start="Translate_Start", translate_end="Translate_End") -``` -→ Takes as input 'mutations_input.csv' file containing: -``` -| Chromosome | Mutation | Ensembl_Transcript_ID | Translate_Start | Translate_End | -|------------|-------------------|------------------------|-----------------|---------------| -| 1 | g.224411A>C | ENST00000193812 | 0 | 100 | -| 8 | g.25111del | ENST00000174411 | 0 | 294 | -| X | g.1011_1012insAA | ENST00000421914 | 9 | 1211 | -``` -→ Saves 'mut_fasta.fa' file containing: -``` ->1:g.224411A>C -TGCTCTGCT ->8:g.25111del -GAGTCGAT ->X:g.1011_1012insAA -TTAGAACTT -``` -→ Saves 'mutations_updated.csv' file containing: -``` - -| Chromosome | Mutation | Ensembl_Transcript_ID | mutation_type | wt_sequence | mutant_sequence | wt_sequence_full | mutant_sequence_full | wt_sequence_aa_full | mutant_sequence_aa_full | -|------------|-------------------|------------------------|---------------|-------------|-----------------|-------------------|----------------------|---------------------|-------------------------| -| 1 | g.224411A>C | ENSMUST00000193812 | Substitution | TGCTATGCT | TGCTCTGCT | ...TGCTATGCT... | ...TGCTCTGCT... | ...CYA... | ...CSA... | -| 8 | g.25111del | ENST00000174411 | Deletion | GAGTCCGAT | GAGTCGAT | ...GAGTCCGAT... | ...GAGTCGAT... | ...ESD... | ...ES... | -| X | g.1011_1012insAA | ENST00000421914 | Insertion | TTAGCTT | TTAGAACTT | ...TTAGCTT... | ...TTAGAACTT... | ...A... | ...EL... | - +

\ No newline at end of file diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py index 0d04bc64..920bd3ac 100644 --- a/gget/gget_mutate.py +++ b/gget/gget_mutate.py @@ -328,23 +328,10 @@ def mutate( mut_column: str = "mutation", seq_id_column: str = "seq_ID", mut_id_column: Optional[str] = None, - gtf: Optional[str] = None, - gtf_transcript_id_column: Optional[str] = None, - k: int = 30, - min_seq_len: Optional[int] = None, - optimize_flanking_regions: bool = False, - remove_seqs_with_wt_kmers: bool = False, - max_ambiguous: Optional[int] = None, - merge_identical: bool = True, - merge_identical_rc: bool = True, - update_df: bool = False, - update_df_out: Optional[str] = None, - store_full_sequences: bool = False, - translate: bool = False, - translate_start: Union[int, str, None] = None, - translate_end: Union[int, str, None] = None, + k: Optional[int] = None, out: Optional[str] = None, verbose: bool = True, + **kwargs, ): """ Takes in nucleotide sequences and mutations (in standard mutation annotation - see below) @@ -397,7 +384,7 @@ def mutate( Required when 'gtf' is provided. Default: None Mutant sequence generation/filtering options: - - k (int) Length of sequences flanking the mutation. Default: 30. + - k (int) Length of sequences flanking the mutation. Default: None (take entire sequence). If k > total length of the sequence, the entire sequence will be kept. - min_seq_len (int) Minimum length of the mutant output sequence. Mutant sequences smaller than this will be dropped. Default: None @@ -435,11 +422,23 @@ def mutate( Saves mutated sequences in fasta format (or returns a list containing the mutated sequences if out=None). """ + + if kwargs.get("gtf") or kwargs.get("gtf_transcript_id_column") or kwargs.get("optimize_flanking_regions") or kwargs.get("remove_seqs_with_wt_kmers") or kwargs.get("min_seq_len") or kwargs.get("max_ambiguous") or kwargs.get("merge_identical") or kwargs.get("merge_identical_rc") or kwargs.get("update_df") or kwargs.get("update_df_out") or kwargs.get("store_full_sequences") or kwargs.get("translate") or kwargs.get("translate_start") or kwargs.get("translate_end"): + # print a log message and raise exception + logger.critical( + """ + It appears that you are passing in arguments that are not supported anymore in gget mutate. For use of these arguments, please check out https://github.com/pachterlab/kvar. + """ + ) + raise NotImplementedError global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations, cosmic_incorrect_wt_base, mut_idx_outside_seq columns_to_keep = ["header", seq_id_column, mut_column, "mutation_type", "wt_sequence", "mutant_sequence"] + if k is None: + k = 999999999 # take entire sequence by default + # Load input sequences and their identifiers from fasta file if "." in sequences: titles, seqs = read_fasta(sequences) @@ -565,10 +564,6 @@ def mutate( mutations = add_mutation_type(mutations, mut_column) - # Link sequences to their mutations using the sequence identifiers - if store_full_sequences: - mutations["wt_sequence_full"] = mutations[seq_id_column].map(seq_dict) - # Handle sequences that were not found based on their sequence IDs seqs_not_found = mutations[~mutations[seq_id_column].isin(seq_dict.keys())] if 0 < len(seqs_not_found) < 20: @@ -599,14 +594,16 @@ def mutate( ) total_mutations = mutations.shape[0] - - if mut_id_column is None: - mut_id_column = mut_column - mutations["mutant_sequence"] = "" - mutations["header"] = ( - ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column] - ) + + if mut_id_column is not None: + mutations["header"] = ( + ">" + mutations[mut_id_column] + ) + else: + mutations["header"] = ( + ">" + mutations[seq_id_column] + ":" + mutations[mut_column] + ) # Calculate number of bad mutations uncertain_mutations = mutations[mut_column].str.contains(r"\?").sum() @@ -683,11 +680,6 @@ def mutate( duplication_mask = mutations["mutation_type"] == "duplication" inversion_mask = mutations["mutation_type"] == "inversion" - if remove_seqs_with_wt_kmers: - long_duplications = ((duplication_mask) & ((mutations["end_mutation_position"] - mutations["start_mutation_position"]) >= k)).sum() - logger.info(f"Removing {long_duplications} duplications > k") - mutations = mutations[~((duplication_mask) & ((mutations['end_mutation_position'] - mutations['start_mutation_position']) >= k))] - # Create a mask for all non-substitution mutations non_substitution_mask = ( deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask @@ -783,41 +775,8 @@ def mutate( axis=1 ) # don't forget to increment by 1 later on - if gtf is not None: - assert (mutations_path.endswith(".csv") or mutations_path.endswith(".tsv")), "Mutations must be a CSV or TSV file" - if "start_transcript_position" not in mutations.columns and "end_transcript_position" not in mutations.columns: #* currently hard-coded column names, but optionally can be changed to arguments later - mutations = merge_gtf_transcript_locations_into_cosmic_csv(mutations, gtf, gtf_transcript_id_column = gtf_transcript_id_column) - columns_to_keep.extend(["start_transcript_position", "end_transcript_position", "strand"]) - else: - logger.warning("Transcript positions already present in the input mutations file. Skipping GTF file merging.") - - # adjust start_transcript_position to be 0-index - mutations["start_transcript_position"] -= 1 - - mutations["start_kmer_position"] = mutations[["start_kmer_position", "start_transcript_position"]].max(axis=1) - mutations["end_kmer_position"] = mutations[["end_kmer_position", "end_transcript_position"]].min(axis=1) - mut_apply = (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) if verbose else mutations.apply - if update_df and store_full_sequences: - # Extract flank sequences - if verbose: - tqdm.pandas(desc="Extracting full left flank sequences") - - mutations["left_flank_region_full"] = mut_apply( - lambda row: seq_dict[row[seq_id_column]][0 : row["start_mutation_position"]], axis=1 - ) # ? vectorize - - if verbose: - tqdm.pandas(desc="Extracting full right flank sequences") - - mutations["right_flank_region_full"] = mut_apply( - lambda row: seq_dict[row[seq_id_column]][ - row["end_mutation_position"] + 1 : row["sequence_length"] - ], - axis=1, - ) # ? vectorize - if verbose: tqdm.pandas(desc="Extracting k-mer left flank sequences") @@ -853,52 +812,8 @@ def mutate( # To what extend the beginning of i overlaps with the beginning of d --> shave up to that many nucleotides off the beginning of r1 until k - len(r1) ≥ extent of overlap # To what extend the end of i overlaps with the beginning of d --> shave up to that many nucleotides off the end of r2 until k - len(r2) ≥ extent of overlap - if optimize_flanking_regions: - # Apply the function for beginning of mut_nucleotides with right_flank_region - mutations.loc[ - non_substitution_mask, "beginning_mutation_overlap_with_right_flank" - ] = mutations.loc[non_substitution_mask].apply( - calculate_beginning_mutation_overlap_with_right_flank, axis=1 - ) - - # Apply the function for end of mut_nucleotides with left_flank_region - mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = ( - mutations.loc[non_substitution_mask].apply( - calculate_end_mutation_overlap_with_left_flank, axis=1 - ) - ) - - # Calculate k-len(flank) (see above instructions) - mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = ( - k - mutations.loc[non_substitution_mask, "left_flank_region"].apply(len) - ) - mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = ( - k - mutations.loc[non_substitution_mask, "right_flank_region"].apply(len) - ) - - mutations.loc[non_substitution_mask, "updated_left_flank_start"] = np.maximum( - mutations.loc[ - non_substitution_mask, "beginning_mutation_overlap_with_right_flank" - ] - - mutations.loc[non_substitution_mask, "k_minus_left_flank_length"], - 0, - ) - mutations.loc[non_substitution_mask, "updated_right_flank_end"] = np.maximum( - mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] - - mutations.loc[non_substitution_mask, "k_minus_right_flank_length"], - 0, - ) - - mutations["updated_left_flank_start"] = ( - mutations["updated_left_flank_start"].fillna(0).astype(int) - ) - mutations["updated_right_flank_end"] = ( - mutations["updated_right_flank_end"].fillna(0).astype(int) - ) - - else: - mutations["updated_left_flank_start"] = 0 - mutations["updated_right_flank_end"] = 0 + mutations["updated_left_flank_start"] = 0 + mutations["updated_right_flank_end"] = 0 # Create WT substitution k-mer sequences mutations.loc[substitution_mask, "wt_sequence"] = ( @@ -934,27 +849,6 @@ def mutate( axis=1, ) - if remove_seqs_with_wt_kmers: - if verbose: - tqdm.pandas(desc="Removing mutant fragments that share a kmer with wt fragments") - - mutations['wt_fragment_and_mutant_fragment_share_kmer'] = mut_apply(lambda row: wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment=row['mutant_sequence'], wildtype_fragment=row['wt_sequence'], k=k+1), axis=1) - - mutations_overlapping_with_wt = mutations['wt_fragment_and_mutant_fragment_share_kmer'].sum() - - mutations = mutations[~mutations['wt_fragment_and_mutant_fragment_share_kmer']] - - - if update_df and store_full_sequences: - columns_to_keep.extend(["wt_sequence_full", "mutant_sequence_full"]) - - # Create full sequences (substitution and non-substitution) - mutations["mutant_sequence_full"] = ( - mutations["left_flank_region_full"] - + mutations["mut_nucleotides"] - + mutations["right_flank_region_full"] - ) - # Calculate k-mer lengths and report the distribution mutations["mutant_sequence_kmer_length"] = mutations["mutant_sequence"].apply( lambda x: len(x) if pd.notna(x) else 0 @@ -962,49 +856,6 @@ def mutate( max_length = mutations["mutant_sequence_kmer_length"].max() - if min_seq_len: - rows_less_than_minimum = (mutations["mutant_sequence_kmer_length"] < min_seq_len).sum() - - mutations = mutations[ - mutations["mutant_sequence_kmer_length"] >= min_seq_len - ] - - if verbose: - logger.info( - f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}..." - ) - - if max_ambiguous is not None: - # Get number of 'N' or 'n' occuring in the sequence - mutations['num_N'] = mutations['mutant_sequence'].str.lower().str.count('n') - num_rows_with_N = (mutations['num_N'] > max_ambiguous).sum() - mutations = mutations[mutations['num_N'] <= max_ambiguous] - - if verbose: - logger.info( - f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's..." - ) - - # Drop the 'num_N' column after filtering - mutations = mutations.drop(columns=['num_N']) - - try: - # Create bins of width 5 from 0 to max_length - bins = range(0, max_length + 6, 5) - - # Bin the lengths and count the number of elements in each bin - binned_lengths = pd.cut( - mutations["mutant_sequence_kmer_length"], bins=bins, right=False - ) - bin_counts = binned_lengths.value_counts().sort_index() - - # Display the report - if verbose: - logger.debug("Report of the number of elements in each bin of width 5:") - logger.debug(bin_counts) - except Exception as e: - pass - # split_cols = mutations[mut_id_column].str.split("_", n=1, expand=True) # if split_cols.shape[1] == 1: @@ -1039,177 +890,13 @@ def mutate( {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq/total_mutations*100:.2f}%) """ - if remove_seqs_with_wt_kmers: - report += f"""{long_duplications} duplications longer than k found ({long_duplications/total_mutations*100:.2f}%) - {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt/total_mutations*100:.2f}%) - """ - - if min_seq_len: - report += f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum/total_mutations*100:.2f}%) - """ - - if max_ambiguous is not None: - report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N/total_mutations*100:.2f}%) - """ - if good_mutations != total_mutations: logger.warning(report) else: logger.info("All mutations correctly recorded") - if translate and update_df and store_full_sequences: - columns_to_keep.extend(["wt_sequence_aa_full", "mutant_sequence_aa_full"]) - - if not mutations_path: - assert ( - type(translate_start) != str and type(translate_end) != str - ), "translate_start and translate_end must be integers when translating sequences (or default None)." - if translate_start is None: - translate_start = 0 - if translate_end is None: - translate_end = mutations["sequence_length"][0] - - # combined_df['ORF'] = combined_df[translate_start] % 3 - - if verbose: - tqdm.pandas(desc="Translating WT amino acid sequences") - mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].progress_apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) - ) - else: - mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) - ) - - if verbose: - tqdm.pandas(desc="Translating mutant amino acid sequences") - - mutations["mutant_sequence_aa_full"] = mutations[ - "mutant_sequence_full" - ].progress_apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) - ) - - else: - mutations["mutant_sequence_aa_full"] = mutations[ - "mutant_sequence_full" - ].apply( - lambda x: translate_sequence( - x, start=translate_start, end=translate_end - ) - ) - - print( - f"Translated mutated sequences: {mutations['wt_sequence_aa_full']}" - ) - else: - if not translate_start: - translate_start = "translate_start" - - if not translate_end: - translate_end = "translate_end" - - if translate_start not in mutations.columns: - mutations["translate_start"] = 0 - - if translate_end not in mutations.columns: - mutations["translate_end"] = mutations["sequence_length"] - - if verbose: - tqdm.pandas(desc="Translating WT amino acid sequences") - - mutations["wt_sequence_aa_full"] = mut_apply( - lambda row: translate_sequence( - row["wt_sequence_full"], row[translate_start], row[translate_end] - ), - axis=1, - ) - - if verbose: - tqdm.pandas(desc="Translating mutant amino acid sequences") - - mutations["mutant_sequence_aa_full"] = mut_apply( - lambda row: translate_sequence( - row["mutant_sequence_full"], - row[translate_start], - row[translate_end], - ), - axis=1, - ) - mutations = mutations[columns_to_keep] - if merge_identical: - logger.info("Merging identical mutated sequences") - - if merge_identical_rc: - mutations['mutant_sequence_rc'] = mutations['mutant_sequence'].apply(reverse_complement) - - # Create a column that stores a sorted tuple of (mutant_sequence, mutant_sequence_rc) - mutations['mutant_sequence_and_rc_tuple'] = mutations.apply( - lambda row: tuple(sorted([row['mutant_sequence'], row['mutant_sequence_rc']])), - axis=1 - ) - - # mutations = mutations.drop(columns=['mutant_sequence_rc']) - - group_key = 'mutant_sequence_and_rc_tuple' - columns_not_to_semicolon_join = ['mutant_sequence', 'mutant_sequence_rc'] - else: - group_key = 'mutant_sequence' - columns_not_to_semicolon_join = [] - - if update_df: - logger.warning("Merging identical mutated sequences can take a while if update_df=True since it will concatenate all MCRSs too)") - mutations = ( - mutations.groupby(group_key, sort=False) - .agg({col: 'first' if col in columns_not_to_semicolon_join else lambda x: ";".join(x.astype(str)) for col in mutations.columns}) # Concatenate values with semicolons - .reset_index(drop=merge_identical_rc) # drop if merging by mutant_sequence_and_rc_tuple, but not if merging by mutant_sequence - ) - - else: - mutations_temp = ( - mutations.groupby(group_key, sort=False, group_keys=False)["header"] - .apply(";".join) - .reset_index() - ) - - if merge_identical_rc: - mutations_temp = mutations_temp.merge(mutations[['mutant_sequence', group_key]], on=group_key, how="left") - mutations_temp = mutations_temp.drop_duplicates(subset='header') - mutations_temp.drop(columns=[group_key], inplace=True) - - mutations = mutations_temp - - # apply remove_gt_after_semicolon to mutant_sequence - mutations["header"] = mutations["header"].apply( - remove_gt_after_semicolon - ) - - # Calculate the number of semicolons in each entry - mutations['semicolon_count'] = mutations['header'].str.count(';') - - mutations['semicolon_count'] += 1 - - # Convert all 1 values to NaN - mutations['semicolon_count'] = mutations['semicolon_count'].replace(1, np.nan) - - # Take the sum across all rows of the new column - total_semicolons = int(mutations['semicolon_count'].sum()) - - mutations = mutations.drop(columns=['semicolon_count']) - - if verbose: - logger.info( - f"{total_semicolons} identical mutated sequences were merged (headers were combined and separated using a semicolon (;). Occurences of identical mutated sequences may be reduced by increasing k." - ) - empty_kmer_count = (mutations["mutant_sequence"] == "").sum() if empty_kmer_count > 0 and verbose: @@ -1221,21 +908,6 @@ def mutate( mutations['header'] = mutations['header'].str[1:] # remove the > character - if update_df: - logger.info("Saving dataframe with updated mutation info...") - saved_updated_df = True - logger.warning("File size can be very large if the number of mutations is large.") - if not update_df_out: - if not mutations_path: - logger.warning("mutations_path must be provided if update_df is True and update_df_out is not provided.") - saved_updated_df = False - else: - base_name, ext = os.path.splitext(mutations_path) - update_df_out = f"{base_name}_updated{ext}" - if saved_updated_df: - mutations.to_csv(update_df_out, index=False) - print(f"Updated mutation info has been saved to {update_df_out}") - mutations["fasta_format"] = ( ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n" ) diff --git a/gget/main.py b/gget/main.py index 4419931b..aa1718fa 100644 --- a/gget/main.py +++ b/gget/main.py @@ -1981,115 +1981,13 @@ def main(): required=False, help="Name of the column containing the IDs of each mutation in 'mutations'. Default: Same as 'mut_column'.", ) - parser_mutate.add_argument( - "-gtf", - "--gtf", - default=None, - type=str, - required=False, - help="Path to a .gtf file. When providing a genome fasta file as input for 'sequences', you can provide a .gtf file here and the input sequences will be defined according to the transcript boundaries, e.g. 'path/to/genome_annotation.gtf'.", - ) - parser_mutate.add_argument( - "-gtic", - "--gtf_transcript_id_column", - default=None, - type=str, - required=False, - help="Column name in the input 'mutations' file containing the transcript ID. In this case, column 'seq_id_column' should contain the chromosome number. Required when 'gtf' is provided.", - ) parser_mutate.add_argument( "-k", "--k", - default=30, - type=int, - required=False, - help="Length of sequences flanking the mutation. If k > total length of the sequence, the entire sequence will be kept.", - ) - parser_mutate.add_argument( - "-msl", - "--min_seq_len", - default=None, - type=int, - required=False, - help="Minimum length of the mutant output sequence, e.g. 100. Mutant sequences smaller than this will be dropped.", - ) - parser_mutate.add_argument( - "-ma", - "--max_ambiguous", default=None, type=int, required=False, - help="Maximum number of 'N' (or 'n') characters allowed in the output sequence, e.g. 10. Default: None (no ambiguous character filter will be applied).", - ) - parser_mutate.add_argument( - "-ofr", - "--optimize_flanking_regions", - default=False, - action="store_true", - required=False, - help="Removes nucleotides from either end of the mutant sequence to ensure (when possible) that the mutant sequence does not contain any k-mers also found in the wildtype/input sequence.", - ) - parser_mutate.add_argument( - "-rswk", - "--remove_seqs_with_wt_kmers", - default=False, - action="store_true", - required=False, - help="Removes output sequences where at least one k-mer is also present in the wildtype/input sequence in the same region. When used with `--optimize_flanking_regions`, only sequences for which a wildtpye kmer is still present after optimization will be removed.", - ) - parser_mutate.add_argument( - "-mio", - "--merge_identical_off", - default=True, - action="store_false", - required=False, - help="Do not merge identical mutant sequences in the output (by default, identical sequences will be merged by concatenating the sequence headers for all identical sequences).", - ) - parser_mutate.add_argument( - "-udf", - "--update_df", - default=False, - action="store_true", - required=False, - help="Updates the input `mutations` DataFrame to include additional columns with the mutation type, wildtype nucleotide sequence, and mutant nucleotide sequence (only valid if `mutations` is a .csv or .tsv file).", - ) - parser_mutate.add_argument( - "-udf_o", - "--update_df_out", - default=None, - type=str, - required=False, - help="Path to output csv file containing the updated DataFrame, e.g. 'path/to/mutations_updated.csv'. Only valid when used with `--update_df`. Default: None -> the new csv file will be saved in the same directory as the `mutations` DataFrame with appendix '_updated'.", - ) - parser_mutate.add_argument( - "--translate", - default=None, - action="store_true", - required=False, - help="Adds additional columns to the updated `mutations` DataFrame containing the wildtype and mutant amino acid sequences. Only valid when used with `--store_full_sequences`.", - ) - parser_mutate.add_argument( - "-ts", - "--translate_start", - default=None, - type=int_or_str, - required=False, - help="(int or str) The position in the input nucleotide sequence to start translating, e.g. 5. If a string is provided, it should correspond to a column name in `mutations` containing the open reading frame start positions for each sequence/mutation. Only valid when used with `--translate`. Default: translates from the beginning of each sequence.", - ) - parser_mutate.add_argument( - "--translate_end", - default=None, - type=int_or_str, - required=False, - help="(int or str) The position in the input nucleotide sequence to end translating, e.g. 35. If a string is provided, it should correspond to a column name in `mutations` containing the open reading frame end positions for each sequence/mutation. Only valid when used with `--translate`. Default: translates until the end of each sequence.", - ) - parser_mutate.add_argument( - "-sfs", - "--store_full_sequences", - default=False, - action="store_true", - required=False, - help="Includes the complete wildtype and mutant sequences in the updated `mutations` DataFrame (not just the sub-sequence with k-length flanks). Only valid when used with `--update_df`.", + help="Length of sequences flanking the mutation. If k is None or k > total length of the sequence, the entire sequence will be kept. Default: None", ) parser_mutate.add_argument( "-o", @@ -2641,23 +2539,10 @@ def main(): mutate_results = mutate( sequences=seqs, mutations=muts, - gtf=args.gtf, - gtf_transcript_id_column=args.gtf_transcript_id_column, k=args.k, mut_column=args.mut_column, mut_id_column=args.mut_id_column, seq_id_column=args.seq_id_column, - min_seq_len=args.min_seq_len, - max_ambiguous=args.max_ambiguous, - optimize_flanking_regions=args.optimize_flanking_regions, - remove_seqs_with_wt_kmers=args.remove_seqs_with_wt_kmers, - merge_identical=args.merge_identical_off, - update_df=args.update_df, - update_df_out=args.update_df_out, - store_full_sequences=args.store_full_sequences, - translate=args.translate, - translate_start=args.translate_start, - translate_end=args.translate_end, out=args.out, verbose=args.quiet, ) diff --git a/tests/fixtures/test_mutate.json b/tests/fixtures/test_mutate.json index e311a66b..1fa8c754 100644 --- a/tests/fixtures/test_mutate.json +++ b/tests/fixtures/test_mutate.json @@ -4,7 +4,7 @@ "args": { "sequences": "", "mutations": "c.35G>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "GCCCCACCCCGCCCCTCCCCGCCCCACCCCACCCCTCCCCGCCCCACCCCGCCCCTCCCCG", "global_variables": {} @@ -14,7 +14,7 @@ "args": { "sequences": "", "mutations": "c.65G>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "GCCCCTCCCCGCCCCACCCCGCCCCTCCCCACCCCACCCCG", "global_variables": {} @@ -24,7 +24,7 @@ "args": { "sequences": "", "mutations": "c.5G>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCCCACCCCACCCCGCCCCTCCCCGCCCCACCCCG", "global_variables": {} @@ -34,7 +34,7 @@ "args": { "sequences": "", "mutations": "c.35del", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "GCCCCACCCCGCCCCTCCCCGCCCCACCCCCCCCTCCCCGCCCCACCCCGCCCCTCCCCG", "global_variables": {} @@ -44,7 +44,7 @@ "args": { "sequences": "", "mutations": "c.35_40del", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "GCCCCACCCCGCCCCTCCCCGCCCCACCCCCCCCGCCCCACCCCGCCCCTCCCCGCCCCA", "global_variables": {} @@ -54,9 +54,9 @@ "args": { "sequences": "", "mutations": "c.31del", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CGCCCCACCCCGCCCCTCCCCGCCCCACCCGCCCCTCCCCGCCCCACCCCGCCCCTC", + "expected_result": "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCGCCCCTCCCCGCCCCACCCCGCCCCTC", "global_variables": {} }, "test_single_deletion_with_left_repeats": { @@ -64,9 +64,9 @@ "args": { "sequences": "", "mutations": "c.34del", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CGCCCCACCCCGCCCCTCCCCGCCCCACCCGCCCCTCCCCGCCCCACCCCGCCCCTC", + "expected_result": "CGCCCCACCCCGCCCCTCCCCGCCCCACCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCC", "global_variables": {} }, "test_multi_deletion_with_right_repeats": { @@ -74,9 +74,9 @@ "args": { "sequences": "", "mutations": "c.31_32del", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CCGCCCCACCCCGCCCCTCCCCGCCCCACCGCCCCTCCCCGCCCCACCCCGCCCCTCC", + "expected_result": "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCGCCCCTCCCCGCCCCACCCCGCCCCTCC", "global_variables": {} }, "test_single_insertion": { @@ -84,7 +84,7 @@ "args": { "sequences": "", "mutations": "c.4_5insT", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCCCTGCCCCACCCCGCCCCTCCCCGCCCCACCCC", "global_variables": {} @@ -94,7 +94,7 @@ "args": { "sequences": "", "mutations": "c.65_66insTTTTT", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCCCTCCCCGCCCCACCCCGCCCCTCCCCGTTTTTCCCCACCCCG", "global_variables": {} @@ -104,7 +104,7 @@ "args": { "sequences": "", "mutations": "c.20_21insCCAAA", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCCCGCCCCACCCCGCCCCTCCAAACCCCGCCCCACCCCGCCCCTCCCCGCCCCA", "global_variables": {} @@ -114,7 +114,7 @@ "args": { "sequences": "", "mutations": "c.38delinsAAA", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCACCCCGCCCCTCCCCGCCCCACCCCGCCAAACTCCCCGCCCCACCCCGCCCCTCCCCGCCC", "global_variables": {} @@ -124,7 +124,7 @@ "args": { "sequences": "", "mutations": "c.38_40delinsAAA", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCACCCCGCCCCTCCCCGCCCCACCCCGCCAAACCCCGCCCCACCCCGCCCCTCCCCGCCCCA", "global_variables": {} @@ -134,7 +134,7 @@ "args": { "sequences": "", "mutations": "c.36_37delinsAG", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGAGCCTCCCCGCCCCACCCCGCCCCTCCCCGCC", "global_variables": {} @@ -144,9 +144,9 @@ "args": { "sequences": "", "mutations": "c.36_37delinsAC", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGACCCTCCCCGCCCCACCCCGCCCCTCCCCGC", + "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGACCCTCCCCGCCCCACCCCGCCCCTCCCCGCC", "global_variables": {} }, "test_multi_delins_with_true_right_repeats": { @@ -154,9 +154,9 @@ "args": { "sequences": "", "mutations": "c.36_37delinsCA", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CCCACCCCGCCCCTCCCCGCCCCACCCCGCACCTCCCCGCCCCACCCCGCCCCTCCCCGCC", + "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGCACCTCCCCGCCCCACCCCGCCCCTCCCCGCC", "global_variables": {} }, "test_single_dup": { @@ -164,9 +164,9 @@ "args": { "sequences": "", "mutations": "c.35dup", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGGCCCCTCCCCGCCCCACCCCGCCCCTCCCC", + "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGGCCCCTCCCCGCCCCACCCCGCCCCTCCCCG", "global_variables": {} }, "test_multi_dup": { @@ -174,9 +174,9 @@ "args": { "sequences": "", "mutations": "c.35_37dup", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CCACCCCGCCCCTCCCCGCCCCACCCCGCCGCCCCTCCCCGCCCCACCCCGCCCCTCC", + "expected_result": "CCACCCCGCCCCTCCCCGCCCCACCCCGCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCC", "global_variables": {} }, "test_inversion_with_overlaps": { @@ -184,9 +184,9 @@ "args": { "sequences": "", "mutations": "c.35_38inv", - "optimize_flanking_regions": true + "k": 30 }, - "expected_result": "CCCCACCCCGCCCCTCCCCGCCCCACCCCGGGCCTCCCCGCCCCACCCCGCCCCTCCCCGCC", + "expected_result": "GCCCCACCCCGCCCCTCCCCGCCCCACCCCGGGCCTCCCCGCCCCACCCCGCCCCTCCCCGCCC", "global_variables": {} }, "test_list_of_mutations": { @@ -213,7 +213,7 @@ "args": { "sequences": "", "mutations": "c.20+3T>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": null, "global_variables": { @@ -225,7 +225,7 @@ "args": { "sequences": "", "mutations": "c.20-3T>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": null, "global_variables": { @@ -237,7 +237,7 @@ "args": { "sequences": "", "mutations": "c.20*5T>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": null, "global_variables": { @@ -249,7 +249,7 @@ "args": { "sequences": "", "mutations": "c.?", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": null, "global_variables": { @@ -261,7 +261,7 @@ "args": { "sequences": "", "mutations": "c.(20_28)del", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": null, "global_variables": { @@ -273,7 +273,7 @@ "args": { "sequences": "", "mutations": "c.99999999C>A", - "optimize_flanking_regions": true + "k": 30 }, "expected_result": null, "global_variables": { @@ -285,53 +285,9 @@ "args": { "sequences": "", "mutations": "c.40T>G", - "optimize_flanking_regions": true, - "k": 54 + "k": 30 }, "expected_result": "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCGCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCC", "global_variables": {} - }, - "test_large_min_seq_length": { - "type": "assert_mutate", - "args": { - "sequences": "", - "mutations": "c.35G>A", - "optimize_flanking_regions": true, - "min_seq_len": 100 - }, - "expected_result": null, - "global_variables": {} - }, - "test_single_deletion_with_right_repeats_and_unoptimized_flanks": { - "type": "assert_mutate", - "args": { - "sequences": "", - "mutations": "c.31del", - "optimize_flanking_regions": false - }, - "expected_result": "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCGCCCCTCCCCGCCCCACCCCGCCCCTC", - "global_variables": {} - }, - "test_single_deletion_with_right_repeats_and_removing_seqs_with_wt_kmers": { - "type": "assert_mutate", - "args": { - "sequences": "", - "mutations": "c.31del", - "optimize_flanking_regions": false, - "remove_seqs_with_wt_kmers": true - }, - "expected_result": null, - "global_variables": {} - }, - "test_sequence_with_N": { - "type": "assert_mutate_N", - "args": { - "sequences": "", - "mutations": "c.35G>A", - "optimize_flanking_regions": true, - "max_ambiguous": 0 - }, - "expected_result": null, - "global_variables": {} } } \ No newline at end of file