add: implement index by column name

FedericaBrando · FedericaBrando · commit b71f9764b0ed · 2024-05-02T14:26:07.000+02:00
diff --git a/containers_build/boostdm/features/aachange.py b/containers_build/boostdm/features/aachange.py
@@ -10,12 +10,12 @@
 def get_aachange(chr_, pos, alt, gene, reader):
 
     for data in reader.get(chr_, pos, pos):
-        alt_vep = (data[3] == alt)
-        mane_vep = (data[-5] != '-') # impose MANE transcript
-        correct_gene = (data[-9] == gene) # skip cases with antisense overlapping gene (gene is gene_symbol)
+        alt_vep = (data['ALT'] == alt)
+        mane_vep = (data['MANE_SELECT'] != '-') # impose MANE transcript
+        correct_gene = (data['SYMBOL'] == gene) # skip cases with antisense overlapping gene (gene is gene_symbol)
         if alt_vep and mane_vep and correct_gene:
-            aas = data[11]  # [11] -> amino-acids involved in change ("I/T")
-            aa_pos = data[10]  # [10] -> amino-acid position
+            aas = data['AA']  # amino-acids involved in change ("I/T")
+            aa_pos = data['PROT_POS']  # amino-acid position
             if '/' in aas:
                 aa_ref, aa_alt = tuple(aas.split('/'))
                 return aa_ref + aa_pos + aa_alt
diff --git a/containers_build/boostdm/features/consequence_type.py b/containers_build/boostdm/features/consequence_type.py
@@ -9,11 +9,11 @@ def get_csqn_type(chr_, pos, alt, gene, reader):
 
     for data in reader.get(chr_, pos, pos):
         
-        alt_vep = (data[3] == alt)           # same alternate allele
-        mane_vep = (data[-5] != '-')  # impose mane transcript
-        correct_gene = (data[-9] == gene)    # skip cases with antisense overlapping genes
+        alt_vep = (data['ALT'] == alt)           # same alternate allele
+        mane_vep = (data["MANE_SELECT"] != '-')  # impose mane transcript
+        correct_gene = (data["SYMBOL"] == gene)    # skip cases with antisense overlapping genes
         if alt_vep and mane_vep and correct_gene:
-            csqn = CONSEQUENCES_LIST[min([CONSEQUENCES_DICT[c] for c in data[7].split(',')])]
+            csqn = CONSEQUENCES_LIST[min([CONSEQUENCES_DICT[c] for c in data["CNSQ"].split(',')])]
             return AGGREGATION_DICT.get(csqn, None)
     
     return None
@@ -23,5 +23,5 @@ def add_feature(df):
 
     with Tabix(TABIX_FILE) as reader:
         get_from_reader = partial(get_csqn_type, reader=reader)
-        df['csqn_type'] = df.apply(lambda row: get_from_reader(str(row['chr']), int(row['pos']), row['alt'], row['gene']), axis=1)
+        df['csqn_type'] = df.apply(lambda row: get_from_reader(str(row['chr']), int(row['pos']), row['alt'], row['gene']), axis=1)
     return df
diff --git a/containers_build/boostdm/features/exon.py b/containers_build/boostdm/features/exon.py
@@ -26,11 +26,11 @@ def nmd_rule(exon, total_exons):
 def get_exon(chr_, pos, alt,gene, reader):
 
     for data in reader.get(chr_, pos, pos):
-        alt_vep = (data[3] == alt)
-        mane_vep = (data[-5] != '-') # impose mane transcript
-        correct_gene = (data[-9] == gene)  # skip cases with antisense overlapping gene
+        alt_vep = (data["ALT"] == alt)
+        mane_vep = (data["MANE_SELECT"] != '-') # impose mane transcript
+        correct_gene = (data["SYMBOL"] == gene)  # skip cases with antisense overlapping gene
         if alt_vep and mane_vep and correct_gene:
-            exons = data[-2]
+            exons = data["EXON"]
             if '/' in exons:
                 exon, total_exons = tuple(exons.split('/'))
             else:
diff --git a/containers_build/boostdm/vepreader.py b/containers_build/boostdm/vepreader.py
@@ -5,6 +5,12 @@
 GENOME_SEQUENCE_MAPS.update({'chrX': 'X', '23': 'X', 'chr23': 'X', 'chrY': 'Y', '24': 'Y', 'chr24': 'Y'})
 GENOME_SEQUENCE_MAPS.update({'chrM': 'M', 'MT': 'M', 'chrMT': 'M'})
 
+HEADER = [  # column names, in file order, zipped onto each row yielded by Tabix.get
+    'CHR', 'POS', 'REF', 'ALT', 'GENE','ENST','TYPE','CNSQ','cDNA_POS',
+    'CDS_POS', 'PROT_POS','AA','CODONS','EXISTING_VARIATION','IMPACT','DISTANCE','STRAND','FLAGS','SYMBOL',
+    'SYMBOL_SOURCE','HGNC_ID','CANONICAL','MANE_SELECT','MANE_PLUS_CLINICAL','ENSP','EXON','INTRON'
+    ]
+
 
 class Tabix:
 
@@ -22,6 +28,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
     def get(self, chromosome, start, stop):
         chr_ = self.map.get(chromosome, chromosome)
-        for row in self.tb.query("{}".format(chr_), start, stop):
-            yield row
+        for row in self.tb.query('{}'.format(chr_), start, stop):
+            row_dict = dict(zip(HEADER, row))  # key each field by column name; zip stops at the shorter of HEADER/row
+            yield row_dict