Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow for returning a dataframe from gget.mutate, w/ more context #169

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 23 additions & 17 deletions gget/gget_mutate.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ def mutate(
The identifiers (following the '>') of the mutated sequences in the output fasta will be '>[seq_ID]_[mut_ID]'.
- verbose (True/False) whether to print progress information. Default: True

Saves mutated sequences in fasta format (or returns a list containing the mutated sequences if out=None).
Saves mutated sequences in fasta format. If out=None, instead returns the mutation dataframe when update_df is True, or a list containing the mutated sequences otherwise.
"""

global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations, cosmic_incorrect_wt_base, mut_idx_outside_seq
Expand All @@ -471,6 +471,8 @@ def mutate(
"mutation_type",
"wt_sequence",
"mutant_sequence",
"start_mutation_position",
"end_mutation_position"
]

# Load input sequences and their identifiers from fasta file
Expand Down Expand Up @@ -672,7 +674,7 @@ def mutate(

if mutations.empty:
logger.warning("No valid mutations found in the input.")
return []
return mutations if update_df else []

# Split nucleotide positions into start and end positions
split_positions = mutations["nucleotide_positions"].str.split("_", expand=True)
Expand Down Expand Up @@ -713,7 +715,7 @@ def mutate(

if mutations.empty:
logger.warning("No valid mutations found in the input.")
return []
return mutations if update_df else []

# Create masks for each type of mutation
mutations["wt_nucleotides_ensembl"] = None
Expand Down Expand Up @@ -789,7 +791,7 @@ def mutate(

if mutations.empty:
logger.warning("No valid mutations found in the input.")
return []
return mutations if update_df else []

# Adjust the start and end positions for insertions
mutations.loc[
Expand Down Expand Up @@ -1310,9 +1312,9 @@ def mutate(
)
if not update_df_out:
if not mutations_path:
logger.warning(
"mutations_path must be provided if update_df is True and update_df_out is not provided."
)
# logger.warning(
# "mutations_path must be provided if update_df is True and update_df_out is not provided."
# )
saved_updated_df = False
else:
base_name, ext = os.path.splitext(mutations_path)
Expand All @@ -1335,13 +1337,17 @@ def mutate(

# When out=None, return list of mutated seqs
else:
all_mut_seqs = []
all_mut_seqs.extend(mutations["mutant_sequence"].values)

# Remove empty strings from final list of mutated sequences
# (these are introduced when unknown mutations are encountered)
while "" in all_mut_seqs:
all_mut_seqs.remove("")

if len(all_mut_seqs) > 0:
return all_mut_seqs
if update_df:
return mutations[columns_to_keep]
else:
all_mut_seqs = []
all_mut_seqs.extend(mutations["mutant_sequence"].values)

# Remove empty strings from final list of mutated sequences
# (these are introduced when unknown mutations are encountered)
while "" in all_mut_seqs:
all_mut_seqs.remove("")

if len(all_mut_seqs) > 0:
return all_mut_seqs
return []