Skip to content

Commit

Permalink
Merge pull request #1003 from griffithlab/arriba_invalid_characters
Browse files Browse the repository at this point in the history
Handle invalid characters in Arriba input files
  • Loading branch information
susannasiebert authored Aug 7, 2023
2 parents 2f12cf3 + 8f73017 commit 4f83901
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 5 deletions.
2 changes: 2 additions & 0 deletions docs/pvacseq/input_file_prep/proximal_vcf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ ______________________________
Phase variants using GATK’s ReadBackedPhasing
_____________________________________________

Unfortunately, the tool used for this step is no longer available in current versions of GATK. We recommend using GATK 3.6.0 to run this step.

.. code-block:: none
/usr/bin/java -Xmx16g -jar /opt/GenomeAnalysisTK.jar \
Expand Down
12 changes: 8 additions & 4 deletions pvactools/lib/fasta_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ def __init__(self, **kwargs):
self.proximal_variants_file = kwargs.pop('proximal_variants_file', None)
self.proximal_variants = self.parse_proximal_variants_file()


def contains_invalid_characters(self, sequence):
    """Report whether ``sequence`` contains a character treated as invalid.

    The characters checked are ``*``, ``X`` and ``?``. The check is
    case-sensitive, so a lowercase ``x`` is not flagged.

    Args:
        sequence: The string to inspect.

    Returns:
        True if at least one of the checked characters occurs in
        ``sequence``, otherwise False.
    """
    return any(character in sequence for character in ('*', 'X', '?'))

def position_out_of_bounds(self, position, sequence):
    """Return True when ``position`` is greater than the last index of ``sequence``."""
    last_index = len(sequence) - 1
    return position > last_index

Expand Down Expand Up @@ -305,10 +312,7 @@ def execute(self):
if subsequence.endswith('X'):
subsequence = subsequence[:-1]

if '*' in subsequence:
continue

if 'X' in subsequence:
if self.contains_invalid_characters(subsequence):
continue

if len(subsequence) < self.epitope_length:
Expand Down
2 changes: 1 addition & 1 deletion pvactools/tools/pvacfuse/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def main(args_input = sys.argv[1:]):
for epitope_length in epitope_lengths:
(input_file, per_epitope_output_dir) = generate_fasta(args, output_dir, epitope_length)
if os.path.getsize(input_file) == 0:
print("The intermediate FASTA file for epitope length {} is empty. Please check that the input AGfusion directory contains fusion entries with `*_protein.fa` files. Fusion entries without this file cannot be processed by pVACfuse.".format(epitope_length))
# Fill the message's {} placeholder with the current epitope length;
# without the .format() call a literal "{}" would be printed.
print("The intermediate FASTA file for epitope length {} is empty. No processable fusions found.".format(epitope_length))
continue

run_arguments['input_file'] = input_file
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#gene1 gene2 strand1(gene/fusion) strand2(gene/fusion) breakpoint1 breakpoint2 site1 site2 type split_reads1 split_reads2 discordant_mates coverage1 coverage2 confidence reading_frame tags retained_protein_domains closest_genomic_breakpoint1 closest_genomic_breakpoint2 gene_id1 gene_id2 transcript_id1 transcript_id2 direction1 direction2 filters fusion_transcript peptide_sequence read_identifiers
PTEN ENSG00000200891(21548),MED6P1(31892) +/+ ./+ chr10:87952259 chr10:88016243 CDS/splice-site intergenic deletion/read-through 163 146 6 3691 3002 high out-of-frame . C2_domain_of_PTEN_tumour-suppressor_protein(100%),Dual_specificity_phosphatase__catalytic_domain(100%)| . . ENSG00000171862 . ENST00000371953 . downstream upstream duplicates(384),low_entropy(4),mismappers(15),mismatches(6),multimappers(2) CCTCACCTCCATGCAGATGCAGCTGTACCTGCAGCAGCTGCAGAAGGTGCAGCCCCCTACGCCGCTACTCCCTTCCGTGAAGGTGCAGTCCCAGCCCCCcCCCCCCCccCCcCCCCCcCCCCcCCCC|CCC??CCCCC?CCCCCCCTGCCGCCCCCACCCCACCCCTCTGTGCAGCAGCAGCTGCAGCAGCAGCCGCCACCACCCCCACCACCCCAGCCCCAGCCTCCACCCCAGCAGCAGCATCAGCCCCCTCCACGGCCCGTGCACTTGCAGCCCATGCAGTTTTCCACCCA LTSMQMQLYLQQLQKVQPPTPLLPSVKVQSQPPPPpPPPPpP|p?p?ppcrphptplcssscsssrhhphhpspslhpsssisplhgpctcspcsfpp K00193:38:H3MYFBBXX:4:1101:18904:31572,K00193:38:H3MYFBBXX:4:1101:20386:3197
15 changes: 15 additions & 0 deletions tests/test_pvacfuse_generate_protein_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,18 @@ def test_input_tsv(self):
expected_output_file = os.path.join(self.test_data_dir, 'output_with_tsv.fasta')
os.unlink("{}.manufacturability.tsv".format(generate_protein_fasta_output_file.name))
self.assertTrue(cmp(generate_protein_fasta_output_file.name, expected_output_file))

def test_arriba_tsv_with_invalid_character(self):
    """Run the executable on ``input_with_invalid_character.tsv`` and expect an empty output file.

    The command must exit with a falsy status, the accompanying
    ``<output>.manufacturability.tsv`` file is removed, and the output
    file itself must have a size of zero bytes.
    """
    input_tsv = os.path.join(self.test_data_dir, 'input_with_invalid_character.tsv')
    output_fasta = tempfile.NamedTemporaryFile()

    command = [
        self.python,
        self.executable,
        input_tsv,
        self.flanking_sequence_length,
        output_fasta.name,
        '-d', 'full'
    ]
    exit_status = call(command, shell=False)
    self.assertFalse(exit_status)

    # Remove the side-car file written next to the output before checking size.
    manufacturability_path = "{}.manufacturability.tsv".format(output_fasta.name)
    os.unlink(manufacturability_path)
    self.assertEqual(os.path.getsize(output_fasta.name), 0)

0 comments on commit 4f83901

Please sign in to comment.