griffithlab · susannasiebert · Oct 1, 2024 · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/docs/pvacvector/output_files.rst b/docs/pvacvector/output_files.rst
@@ -5,17 +5,38 @@
 Output Files
 ============
 
-============================== ===========
-File Name                      Description
-============================== ===========
-``vector_input.fa`` (optional) An intermediate file with vaccine peptide sequences created from the epitopes in a pVACseq output file.
-``<sample_name>_results.fa``   The final output file with the peptide sequences and best spacers in the optimal order.
-``vector.jpg``                 A JPEG visualization of the above result.
-============================== ===========
+The pVACvector pipeline will write its results in separate folders depending on
+which prediction algorithms were chosen:
+
+- ``MHC_Class_I``: for MHC class I prediction algorithms
+- ``MHC_Class_II``: for MHC class II prediction algorithms
+- ``combined``: If both MHC class I and MHC class II prediction algorithms were run, this folder combines the neoepitope predictions from both
+
+Each folder will contain the same list of output files (listed in the order
+created):
+
+.. list-table::
+   :header-rows: 1
+
+   * - File Name
+     - Description
+   * - ``vector_input.fa`` (optional)
+     - An intermediate file with vaccine peptide sequences created from the epitopes in a pVACseq output file.
+   * - 0...n (directory)
+     - One numbered directory for each iteration of clipping necessary. Each
+       one contains subdirectories for the spacers tested, which in turn contain
+       prediction information and a ``junctions.tsv`` intermediate result file for that attempt.
+   * - ``<sample_name>_results.fa``
+     - The final output file with the peptide sequences and best spacers in the optimal order.
+   * - ``junctions.tsv``
+     - A tab-separated file listing all of the valid junctions found by pVACvector including spacer and clipping information.
+   * - ``vector.jpg``
+     - A JPEG visualization of the final result in ``<sample_name>_results.fa``.
+   * - ``<sample_name>_results.dna.fa``
+     - The final output file with the backtranslated DNA sequences of the included peptides and best spacers in the optimal order.
 
 .. figure:: ../images/vector.jpg
    :align: center
    :alt: pVACvector result visualization example
 
    pVACvector result visualization example
-
diff --git a/pvactools/lib/fasta_generator.py b/pvactools/lib/fasta_generator.py
@@ -357,44 +357,37 @@ def __init__(self, **kwargs):
         self.input_file         = kwargs['input_file']
         self.output_file_prefix = kwargs['output_file_prefix']
         self.epitope_lengths    = kwargs['epitope_lengths']
-        self.spacers            = kwargs['spacers']
+        self.spacer             = kwargs['spacer']
+        self.junctions_to_test  = kwargs['junctions_to_test']
+        self.clip_length        = kwargs['clip_length']
+        self.output_files = []
 
     def execute(self):
         seq_dict = dict()
         for record in SeqIO.parse(self.input_file, "fasta"):
-            data = {'seq': str(record.seq)}
-            if record.id != record.description:
-                data.update(json.loads(record.description.split(' ', 1)[1]))
-                contains_problematic_peptides = True
-            else:
-                contains_problematic_peptides = False
-            seq_dict[record.id] = data
-        seq_keys = sorted(seq_dict)
-
-        if contains_problematic_peptides:
-            seq_tuples = self.combine_problematic_peptides(seq_dict)
-        else:
-            seq_tuples = list(itertools.permutations(seq_keys, 2))
+            seq_dict[record.id] = str(record.seq)
 
         for length in self.epitope_lengths:
             epitopes = dict()
             fasta_sequences = OrderedDict()
             wingspan_length = length - 1
-            for comb in seq_tuples:
-                seq1 = comb[0]
-                seq2 = comb[1]
-                seq1_seq = seq_dict[seq1]['seq']
-                seq2_seq = seq_dict[seq2]['seq']
-                trunc_seq1 = seq1_seq[(len(seq1_seq) - wingspan_length):len(seq1_seq)]
-                trunc_seq2 = seq2_seq[0:wingspan_length]
-
-                for this_spacer in self.spacers:
-                    if this_spacer != 'None':
-                        seq_ID = seq1 + "|" + this_spacer + "|" + seq2
-                        epitopes[seq_ID] = (trunc_seq1 + this_spacer + trunc_seq2)
-                    else:
-                        seq_ID = seq1 + "|" + seq2
-                        epitopes[seq_ID] = trunc_seq1 + trunc_seq2
+            for (seq1, seq2) in self.junctions_to_test:
+                seq1_seq = seq_dict[seq1]
+                seq2_seq = seq_dict[seq2]
+                for left_clip_length in range(0, self.clip_length+1):
+                    for right_clip_length in range(0, self.clip_length+1):
+                        #These combinations would've already been tested in previous attempts with lower clip lengths and can be skipped
+                        if left_clip_length < self.clip_length and right_clip_length < self.clip_length:
+                            continue
+                        trunc_seq1 = seq1_seq[(len(seq1_seq) - wingspan_length):(len(seq1_seq) - left_clip_length)]
+                        trunc_seq2 = seq2_seq[(0 + right_clip_length):wingspan_length]
+
+                        if self.spacer != 'None':
+                            seq_ID = "{}|{}|{}|{}|{}".format(seq1, left_clip_length, self.spacer, right_clip_length, seq2)
+                            epitopes[seq_ID] = (trunc_seq1 + self.spacer + trunc_seq2)
+                        else:
+                            seq_ID = "{}|{}|{}|{}".format(seq1, left_clip_length, right_clip_length, seq2)
+                            epitopes[seq_ID] = trunc_seq1 + trunc_seq2
 
             for seq_id in epitopes:
                 sequence = epitopes[seq_id]
@@ -403,6 +396,7 @@ def execute(self):
                 fasta_sequences.setdefault(sequence, []).append(seq_id)
 
             output_file = "{}.{}.tsv".format(self.output_file_prefix, length)
+            self.output_files.append(output_file)
             output_key_file = "{}.key".format(output_file)
             writer = open(output_file, 'w')
             key_writer = open(output_key_file, 'w')

diff --git a/pvactools/lib/pipeline.py b/pvactools/lib/pipeline.py
@@ -252,10 +252,14 @@ def generate_fasta(self, chunks):
                 generate_fasta_params['input_file'] = self.tsv_file_path()
                 generate_fasta_params['output_file_prefix'] = split_fasta_file_path
                 generate_fasta_params['epitope_lengths'] = self.epitope_lengths
-                generate_fasta_params['spacers'] = self.spacers
+                generate_fasta_params['junctions_to_test'] = self.junctions_to_test
+                generate_fasta_params['spacer'] = self.spacer
+                generate_fasta_params['clip_length'] = self.clip_length
                 status_message("Generating Variant Peptide FASTA and Key Files - Entries %s" % (fasta_chunk))
                 fasta_generator = self.fasta_generator(generate_fasta_params)
                 fasta_generator.execute()
+                for file_name in fasta_generator.output_files:
+                    shutil.copy(file_name, self.output_dir)
             else:
                 for epitope_length in self.epitope_lengths:
                     split_fasta_file_path = "{}_{}".format(self.split_fasta_basename(epitope_length), fasta_chunk)