Skip to content

Cleaned up commandline arguments #57

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 10, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ varcode>=0.3.17
pylint>=1.4.4
nose>=1.3.6
gtfparse>=0.0.4
mhcnames
26 changes: 10 additions & 16 deletions scripts/topiary
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2015-2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,32 +33,26 @@ Example usage:
from topiary.commandline_args import (
arg_parser,
rna_gene_expression_dict_from_args,
rna_transcript_expression_dict_from_args
rna_transcript_expression_dict_from_args,
write_outputs,
)
from topiary import predict_epitopes_from_args, epitopes_to_dataframe


args = arg_parser.parse_args()

def main():
print("Topiary commandline arguments:")
print(args)
def main(args):
epitopes = predict_epitopes_from_args(args)
gene_expression_dict = rna_gene_expression_dict_from_args(args)
transcript_expression_dict = rna_transcript_expression_dict_from_args(args)
df = epitopes_to_dataframe(
epitopes,
gene_expression_dict=gene_expression_dict,
transcript_expression_dict=transcript_expression_dict)
print("First 10 epitope predictions:")
print(df.ix[:10])
write_outputs(df, args)
print("Total count: %d" % len(df))
if args.output_csv:
print("Saving %s..." % args.output_csv)
df.to_csv(args.output_csv, index=True, index_label="#")
if args.output_html:
print("Saving %s..." % args.output_html)
df.to_html(args.output_html, index=True)

if __name__ == "__main__":
main()
args = arg_parser.parse_args()
print("Topiary commandline arguments:")
print(args)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to print this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'm finding it handy to see how the commandline args got parsed. Do you think it's too noisy?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't used this to see whether it is, but just wanted to point it out in case it was a holdover of debugging code


main(args)
79 changes: 49 additions & 30 deletions scripts/topiary-analyze-cohort
Original file line number Diff line number Diff line change
Expand Up @@ -54,65 +54,77 @@ from .mutation_report import print_mutation_report
parser = argparse.ArgumentParser()

input_group = parser.add_mutually_exclusive_group(required=True)
input_group.add_argument("--variant-input-dir",
type=str,
help="Directory containing MAF or VCF input files")
input_group.add_argument(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is my first time looking at this. Is it worth adding a comment on what goes in the input_group vs. the general args?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's an old script that I wrote for Kipp Akers' analysis of TCGA, and I'm a little hesitant to try to understand it again; I was just reformatting the whitespace. I think eventually it should either get deleted or "modernized" to use the commandline args from the main Topiary script.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

"--variant-input-dir",
type=str,
help="Directory containing MAF or VCF input files")

input_group.add_argument("--variant-input-file",
type=str,
help="Single MAF or VCF input file")
input_group.add_argument(
"--variant-input-file",
type=str,
help="Single MAF or VCF input file")

parser.add_argument("--hla-dir",
parser.add_argument(
"--hla-dir",
type=str,
default=None,
help=("Directory containing HLA allele files (with suffix .hla)"))

parser.add_argument("--output-counts-csv",
parser.add_argument(
"--output-counts-csv",
default="analyze_cohort_results.csv",
help="Path to output file containing mutation/epitope counts")

parser.add_argument("--quiet",
parser.add_argument(
"--quiet",
type=str,
help="Suppress INFO log messages")

parser.add_argument("--binding-threshold",
parser.add_argument(
"--binding-threshold",
type=int,
default=500,
help="Cutoff IC50 score for epitope MHC binding")

parser.add_argument("--combined-maf",
parser.add_argument(
"--combined-maf",
default=False,
action="store_true",
help=("Rather than using filenames to identify patients, "
"a single MAF file can have multiple tumor barcodes."))

parser.add_argument("--rna-filter-dir",
parser.add_argument(
"--rna-filter-dir",
type=str,
default=None,
help=("Directory containing RNASeq gene expression "
"levels (one file per patient). If provided, we "
"filter mutations with no gene expression."))

parser.add_argument("--debug-patient-id",
parser.add_argument(
"--debug-patient-id",
type=str,
default=None,
help=("If we have a directory or a file containing "
"multiple patient IDs, limit that collection to "
"one specific patient ID for debugging."))

parser.add_argument("--debug-scored-epitopes-csv",
parser.add_argument(
"--debug-scored-epitopes-csv",
type=str,
default=None,
help=("If we have a CSV file representing scored "
"epitopes, use that instead of running netMHCpan. "
"If not, generate that CSV file."))

parser.add_argument("--netmhc-cons",
parser.add_argument(
"--netmhc-cons",
default=False,
action="store_true",
help="Use local NetMHCcons binding predictor (otherwise use NetMHCpan)")

parser.add_argument("--resume",
parser.add_argument(
"--resume",
default=False,
action="store_true",
help="Append to an existing output file")
Expand Down Expand Up @@ -168,12 +180,16 @@ def find_mutation_files(


def collect_hla_files(input_dir_string):
return collect_files(input_dir_string, read_hla_file,
return collect_files(
input_dir_string,
read_hla_file,
permissive_parsing=True)


def collect_gene_exp_files(input_dir_string):
return collect_files(input_dir_string, read_gene_exp_file,
return collect_files(
input_dir_string,
read_gene_exp_file,
permissive_parsing=True)


Expand Down Expand Up @@ -212,7 +228,7 @@ def read_gene_exp_file(path, permissive_parsing):
count_col = gene_exp_df.columns[1]
if permissive_parsing:
gene_exp_df[gene_col] = gene_exp_df[gene_col].str.split('|').map(
lambda x: x[0])
lambda x: x[0])
gene_exp_df = gene_exp_df[gene_exp_df[count_col] > 0]
return set(gene_exp_df[gene_col].tolist())

Expand Down Expand Up @@ -286,13 +302,15 @@ def generate_mutation_counts(
scored_epitopes = pd.read_csv(csv_file)
else:
mhc = make_mhc_predictor()
scored_epitopes = mhc.predict(transcripts_df,
mutation_window_size=9)
scored_epitopes = mhc.predict(
transcripts_df,
mutation_window_size=9)
scored_epitopes.to_csv(csv_file)
else:
mhc = make_mhc_predictor()
scored_epitopes = mhc.predict(transcripts_df,
mutation_window_size=9)
scored_epitopes = mhc.predict(
transcripts_df,
mutation_window_size=9)

if not args.quiet:
print scored_epitopes
Expand Down Expand Up @@ -331,12 +349,13 @@ def generate_mutation_counts(
curr_immunogenic_epitopes = immunogenic_epitopes.groupby(['Epitope']).first()
n_immunogenic_epitopes += len(curr_immunogenic_epitopes)
n_immunogenic_mutations += len(curr_immunogenic_epitopes) > 0
logging.info(("%s %s: epitopes %s, ligands %d, imm %d"),
gene,
mut,
n_curr_epitopes,
n_curr_ligands,
len(curr_immunogenic_epitopes))
logging.info(
("%s %s: epitopes %s, ligands %d, imm %d") % (
gene,
mut,
n_curr_epitopes,
n_curr_ligands,
len(curr_immunogenic_epitopes)))
result_tuple = (
n_coding_mutations,
n_epitopes,
Expand Down Expand Up @@ -441,4 +460,4 @@ if __name__ == "__main__":
n_ligand_mutations,
n_ligands,
n_immunogenic_mutations,
n_immunogenic_epitopes))
n_immunogenic_epitopes))
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@
'mhctools >=0.1.8',
'varcode >=0.3.17',
'nose >=1.3.6',
'gtfparse >=0.0.4'
'gtfparse >=0.0.4',
'mhcnames',
],
long_description=readme,
packages=find_packages(exclude="test"),
Expand Down
34 changes: 34 additions & 0 deletions test/test_args_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from topiary.commandline_args import arg_parser, write_outputs
import tempfile
import pandas as pd
from nose.tools import eq_


def test_write_outputs():
    """Check that write_outputs applies column subsetting and renaming.

    A two-column DataFrame is passed to ``write_outputs`` together with
    parsed arguments requesting only column "x", renamed to "X", written
    as CSV. The CSV is then read back and compared against the expected
    single-column frame.
    """
    import os

    # Only the path is needed. Close the handle immediately so the file can
    # be re-opened by name on every platform; delete=False keeps it on disk
    # until we remove it ourselves below.
    with tempfile.NamedTemporaryFile(mode="r+", delete=False) as f:
        output_path = f.name

    try:
        df = pd.DataFrame({
            "x": [1, 2, 3],
            "y": [10, 20, 30]
        })
        args = arg_parser.parse_args([
            "--output-csv", output_path,
            "--subset-output-columns", "x",
            "--rename-output-column", "x", "X",
            "--mhc-predictor", "random",
            "--mhc-alleles", "A0201",
        ])

        write_outputs(
            df,
            args,
            print_df_before_filtering=True,
            print_df_after_filtering=True)
        print("File: %s" % output_path)
        # The index column is read back under the label "#".
        df_from_file = pd.read_csv(output_path, index_col="#")

        df_expected = pd.DataFrame({
            "X": [1, 2, 3]})
        print(df_from_file)
        eq_(len(df_expected), len(df_from_file))
        assert (df_expected == df_from_file).all().all()
    finally:
        # Always clean up so repeated test runs do not leak temp files.
        os.remove(output_path)
2 changes: 1 addition & 1 deletion topiary/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from . import commandline_args

__version__ = '0.0.19'
__version__ = '0.0.20'

__all__ = [
"LazyLigandomeDict",
Expand Down
Loading