Merge remote-tracking branch 'origin/hotfix'

griffithlab · Jul 14, 2023 · e2f0ea5 · e2f0ea5
2 parents 05c9c5c + 86ecee5
commit e2f0ea5
Show file tree

Hide file tree

Showing 23 changed files with 166,424 additions and 69,560 deletions.
diff --git a/HCC1395_inputs.zip b/HCC1395_inputs.zip
diff --git a/docs/conf.py b/docs/conf.py
@@ -70,7 +70,7 @@
 # The short X.Y version.
 version = '4.0'
 # The full version, including alpha/beta/rc tags.
-release = '4.0.0'
+release = '4.0.1'
 
 
 # The language for content autogenerated by Sphinx. Refer to documentation

diff --git a/docs/index.rst b/docs/index.rst
@@ -51,6 +51,17 @@ Contents
    contact
    mailing_list
 
+New in Release |release|
+------------------------
+
+This is a bugfix release. It fixes the following problem(s):
+
+- It fixes errors for a few edge cases when determining the mutation
+  position(s).
+- Update the HCC1395 demo data for pVACview to include elution data.
+- Correctly set NA columns in pVACview export dataframe.
+- Handle Arriba files with empty peptide_sequence fields.
+
 New in Version |version|
 ------------------------
 

diff --git a/docs/releases/4_0.rst b/docs/releases/4_0.rst
@@ -64,3 +64,14 @@ _____________
   the filtered report.
 - A new parameter ``--aggregate-inclusion-binding-threshold`` controls which
   epitope candidates are included in the aggregate report.
+
+New in Version 4.0.1
+--------------------
+
+This is a bugfix release. It fixes the following problem(s):
+
+- It fixes errors for a few edge cases when determining the mutation
+  position(s).
+- Update the HCC1395 demo data for pVACview to include elution data.
+- Correctly set NA columns in pVACview export dataframe.
+- Handle Arriba files with empty peptide_sequence fields.
diff --git a/pvactools/lib/calculate_reference_proteome_similarity.py b/pvactools/lib/calculate_reference_proteome_similarity.py
@@ -294,7 +294,16 @@ def _get_peptide(self, line, mt_records_dict, wt_records_dict):
                 epitope = line['Best Peptide']
                 (full_peptide, wt_peptide, variant_type, mt_amino_acids, wt_amino_acids) = self._get_full_peptide(line, mt_records_dict, wt_records_dict)
                 if variant_type != 'FS':
-                    mt_pos = int(line['Pos'].split('-')[0])
+                    if line['Pos'] == 'NA':
+                        mt_pos = None
+                        for i,(wt_aa,mt_aa) in enumerate(zip(wt_peptide,full_peptide)):
+                            if wt_aa != mt_aa:
+                                mt_pos = i
+                                break
+                        if mt_pos is None:
+                            return None, full_peptide
+                    else:
+                        mt_pos = int(line['Pos'].split('-')[0])
             else:
                 epitope = line['MT Epitope Seq']
                 full_peptide = mt_records_dict[line['Index']]
@@ -453,6 +462,20 @@ def _write_outputs(self, processed_peptides, mt_records_dict, wt_records_dict):
                 peptide, full_peptide = self._get_peptide(line, mt_records_dict, wt_records_dict)
 
                 if self.peptide_fasta:
+                    if peptide is None:
+                        if self._input_tsv_type(line) == 'aggregated':
+                            line['Ref Match'] = 'Not Run'
+                            if self.aggregate_metrics_file:
+                                self.aggregate_metrics[line['ID']]['reference_matches'] = {
+                                    'count': 0,
+                                    'query_peptide': peptide,
+                                    'matches': []
+                                }
+                        else:
+                            line['Reference Match'] = 'Not Run'
+                        writer.writerow(line)
+                        continue
+
                     results = processed_peptides[peptide]
                 else:
                     results = processed_peptides[full_peptide]
@@ -553,7 +576,8 @@ def _get_unique_peptides(self, mt_records_dict, wt_records_dict):
             for line in reader:
                 peptide, full_peptide = self._get_peptide(line, mt_records_dict, wt_records_dict)
                 if self.peptide_fasta:
-                    unique_peptides.add(peptide)
+                    if peptide is not None:
+                        unique_peptides.add(peptide)
                 else:
                     unique_peptides.add(full_peptide)
 

diff --git a/pvactools/lib/input_file_converter.py b/pvactools/lib/input_file_converter.py
@@ -521,7 +521,7 @@ def parse_arriba_file(self, starfusion_entries):
             for record in reader:
                 (five_prime_chr, five_prime_start) = record['breakpoint1'].split(':')
                 (three_prime_chr, three_prime_start) = record['breakpoint2'].split(':')
-                if record['peptide_sequence'] == '.':
+                if record['peptide_sequence'] == '.' or record['peptide_sequence'] is None or record['peptide_sequence'] == "":
                     continue
                 (fusion_position, fusion_amino_acid_sequence) = self.determine_fusion_sequence(record['peptide_sequence'], '|')
                 gene_name = "{}-{}".format(record['#gene1'], record['gene2'])

diff --git a/pvactools/lib/output_parser.py b/pvactools/lib/output_parser.py
@@ -265,7 +265,7 @@ def match_wildtype_and_mutant_entry_for_frameshift(self, result, mt_position, wt
                 result['wt_percentiles'] = self.format_match_na(result, 'percentile')
             mutation_position = self.find_mutation_position(wt_epitope_seq, mt_epitope_seq)
             if mutation_position == peptide_length:
-                result['mutation_position'] = mutation_position
+                result['mutation_position'] = '{}'.format(mutation_position)
             else:
                 result['mutation_position'] = '{}-{}'.format(mutation_position, peptide_length)
             result['wt_epitope_position'] = match_position
@@ -277,7 +277,7 @@ def match_wildtype_and_mutant_entry_for_inframe_indel(self, result, mt_position,
             best_match_position           = previous_result['wt_epitope_position'] + 1
             result['wt_epitope_position'] = best_match_position
             result['match_direction']     = 'right'
-            result['mutation_position'] = self.determine_ins_mut_position_from_previous_result(previous_result, mt_epitope_seq, result)
+            result['mutation_position']   = self.determine_ins_mut_position_from_previous_result(previous_result, mt_epitope_seq, result)
 
             #We need to ensure that the matched WT epitope has enough overlapping amino acids with the MT epitope
             best_match_wt_result = wt_results[str(best_match_position)]
@@ -307,7 +307,10 @@ def match_wildtype_and_mutant_entry_for_inframe_indel(self, result, mt_position,
             result['wt_percentiles'] = self.format_match_na(result, 'percentile')
             #We then infer the mutation position and match direction from the previous MT epitope
             result['match_direction'] = previous_result['match_direction']
-            result['mutation_position'] = self.determine_ins_mut_position_from_previous_result(previous_result, mt_epitope_seq, result)
+            if previous_result['mutation_position'] == 'NA' or previous_result['mutation_position'] == '1':
+                result['mutation_position'] = 'NA'
+            else:
+                result['mutation_position'] = self.determine_ins_mut_position_from_previous_result(previous_result, mt_epitope_seq, result)
             return
 
         baseline_best_match_wt_result      = wt_results[baseline_best_match_position]
@@ -368,12 +371,12 @@ def match_wildtype_and_mutant_entry_for_inframe_indel(self, result, mt_position,
             if result['variant_type'] == 'inframe_ins':
                 mutation_position = self.find_ins_mut_position(baseline_best_match_wt_epitope_seq, mt_epitope_seq, result['amino_acid_change'], match_direction)
                 if mutation_position is None:
-                    result['mutation_position'] = None
+                    result['mutation_position'] = 'NA'
                 else:
                     if previous_result is None:
                         result['mutation_position'] = '{}-{}'.format(mutation_position[0], mutation_position[1]) if len(mutation_position)==2 else '{}'.format(mutation_position[0])
                     else:
-                        if previous_result['mutation_position'] is None:
+                        if previous_result['mutation_position'] == 'NA':
                             result['mutation_position'] = '{}-{}'.format(mutation_position[0], mutation_position[1]) if len(mutation_position)==2 else '{}'.format(mutation_position[0])
                         else:
                             result['mutation_position'] = self.determine_ins_mut_position_from_previous_result(previous_result, mt_epitope_seq, result)

diff --git a/pvactools/lib/prediction_class.py b/pvactools/lib/prediction_class.py
@@ -13,6 +13,11 @@
 from Bio import SeqIO
 import random
 import uuid
+from mhcflurry.downloads import get_default_class1_presentation_models_dir
+from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
+import numpy
+
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 
 class IEDB(metaclass=ABCMeta):
     @classmethod
@@ -318,29 +323,28 @@ def predict(self, input_file, allele, epitope_length, iedb_executable_path, iedb
 
         all_epitopes = list(set(all_epitopes))
         if len(all_epitopes) > 0:
-            tmp_output_file = tempfile.NamedTemporaryFile('r', dir=tmp_dir, delete=False)
-            arguments = ["mhcflurry-predict", "--alleles", allele, "--out", tmp_output_file.name, "--peptides"]
-            arguments.extend(all_epitopes)
-            stderr_fh = tempfile.NamedTemporaryFile('w', dir=tmp_dir, delete=False)
-            try:
-                response = run(arguments, check=True, stdout=DEVNULL, stderr=stderr_fh)
-            except:
-                stderr_fh.close()
-                with open(stderr_fh.name, 'r') as fh:
-                    err = fh.read()
-                os.unlink(stderr_fh.name)
-                raise Exception("An error occurred while calling MHCflurry:\n{}".format(err))
-            stderr_fh.close()
-            os.unlink(stderr_fh.name)
-            tmp_output_file.close()
-            df = pd.read_csv(tmp_output_file.name)
-            os.unlink(tmp_output_file.name)
+            models_dir = get_default_class1_presentation_models_dir(test_exists=True)
+            predictor = Class1PresentationPredictor.load(models_dir)
+            df = predictor.predict(
+                peptides=numpy.array(all_epitopes, dtype='object'),
+                n_flanks=None,
+                c_flanks=None,
+                alleles={allele: [allele]},
+                throw=True,
+                include_affinity_percentile=True,
+                verbose=0
+            )
             df.rename(columns={
-                'mhcflurry_prediction': 'ic50',
-                'mhcflurry_affinity': 'ic50',
-                'mhcflurry_prediction_percentile': 'percentile',
-                'mhcflurry_affinity_percentile': 'percentile'
+                'prediction': 'ic50',
+                'affinity': 'ic50',
+                'prediction_percentile': 'percentile',
+                'affinity_percentile': 'percentile',
+                'processing_score': 'mhcflurry_processing_score',
+                'presentation_score': 'mhcflurry_presentation_score',
+                'presentation_percentile': 'mhcflurry_presentation_percentile',
+                'best_allele': 'allele',
             }, inplace=True)
+            df.drop(labels='peptide_num', axis=1, inplace=True)
             for record in SeqIO.parse(input_file, "fasta"):
                 seq_num = record.id
                 peptide = str(record.seq)

diff --git a/pvactools/lib/run_argument_parser.py b/pvactools/lib/run_argument_parser.py
@@ -203,7 +203,7 @@ def __init__(self):
         tool_name = "pvacseq"
         input_file_help = (
             "A VEP-annotated single- or multi-sample VCF containing genotype, transcript, "
-            "Wildtype protein sequence, and Downstream protein sequence information."
+            "Wildtype protein sequence, and Frameshift protein sequence information."
             "The VCF may be gzipped (requires tabix index)."
         )
         PredictionRunWithFastaGenerationArgumentParser.__init__(self, tool_name, input_file_help)

diff --git a/pvactools/tools/pvacseq/generate_protein_fasta.py b/pvactools/tools/pvacseq/generate_protein_fasta.py
@@ -27,7 +27,7 @@ def define_parser():
     parser.add_argument(
         "input_vcf",
         help="A VEP-annotated single- or multi-sample VCF containing genotype, transcript, "
-            +"Wildtype protein sequence, and Downstream protein sequence information."
+            +"Wildtype protein sequence, and Frameshift protein sequence information."
             +"The VCF may be gzipped (requires tabix index)."
     )
     parser.add_argument(