7
7
import pandas as pd
8
8
9
9
import snps
10
+ from snps .constants import REFERENCE_SEQUENCE_CHROMS
10
11
from snps .io import get_empty_snps_dataframe
11
12
from snps .utils import clean_str , get_utc_now , save_df_as_csv
12
13
@@ -120,14 +121,14 @@ def _write_csv(self):
120
121
121
122
filename = f"{ clean_str (self ._snps .source )} _{ self ._snps .assembly } { ext } "
122
123
123
- comment = (
124
- f"# Source(s): { self ._snps .source } \n "
125
- f"# Build: { self ._snps .build } \n "
126
- f"# Build Detected: { self ._snps .build_detected } \n "
127
- f"# Phased: { self ._snps .phased } \n "
128
- f"# SNPs: { self ._snps .count } \n "
129
- f"# Chromosomes: { self ._snps .chromosomes_summary } \n "
130
- )
124
+ comment = [
125
+ f"# Source(s): { self ._snps .source } " ,
126
+ f"# Build: { self ._snps .build } " ,
127
+ f"# Build Detected: { self ._snps .build_detected } " ,
128
+ f"# Phased: { self ._snps .phased } " ,
129
+ f"# SNPs: { self ._snps .count } " ,
130
+ f"# Chromosomes: { self ._snps .chromosomes_summary } " ,
131
+ ]
131
132
if "header" in self ._kwargs :
132
133
if isinstance (self ._kwargs ["header" ], bool ):
133
134
if self ._kwargs ["header" ]:
@@ -139,7 +140,7 @@ def _write_csv(self):
139
140
self ._snps ._snps ,
140
141
self ._snps ._output_dir ,
141
142
filename ,
142
- comment = comment ,
143
+ comment = " \n " . join ( comment ) + " \n " ,
143
144
atomic = self ._atomic ,
144
145
** self ._kwargs ,
145
146
)
@@ -149,8 +150,8 @@ def _write_vcf(self):
149
150
150
151
References
151
152
----------
152
- 1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019 ,
153
- https://samtools.github.io/hts-specs/VCFv4.2 .pdf
153
+ 1. The Variant Call Format (VCF) Version 4.3 Specification, 27 Nov 2022 ,
154
+ https://samtools.github.io/hts-specs/VCFv4.3 .pdf
154
155
155
156
Returns
156
157
-------
@@ -163,61 +164,37 @@ def _write_vcf(self):
163
164
if not filename :
164
165
filename = f"{ clean_str (self ._snps .source )} _{ self ._snps .assembly } { '.vcf' } "
165
166
166
- comment = (
167
- f"##fileformat=VCFv4.2\n "
168
- f'##fileDate={ get_utc_now ().strftime ("%Y%m%d" )} \n '
169
- f'##source="{ self ._snps .source } ; snps v{ snps .__version__ } ; https://pypi.org/project/snps/"\n '
170
- )
167
+ comment = [
168
+ "##fileformat=VCFv4.3" ,
169
+ f'##fileDate={ get_utc_now ().strftime ("%Y%m%d" )} ' ,
170
+ f'##source="snps v{ snps .__version__ } ; https://pypi.org/project/snps/"' ,
171
+ f'##detectedCompany="{ self ._snps .source } "' ,
172
+ ]
171
173
172
- reference_sequence_chroms = (
173
- "1" ,
174
- "2" ,
175
- "3" ,
176
- "4" ,
177
- "5" ,
178
- "6" ,
179
- "7" ,
180
- "8" ,
181
- "9" ,
182
- "10" ,
183
- "11" ,
184
- "12" ,
185
- "13" ,
186
- "14" ,
187
- "15" ,
188
- "16" ,
189
- "17" ,
190
- "18" ,
191
- "19" ,
192
- "20" ,
193
- "21" ,
194
- "22" ,
195
- "X" ,
196
- "Y" ,
197
- "MT" ,
198
- )
174
+ if self ._snps .build_original :
175
+ comment .append (f"##detectedOriginalBuild={ self ._snps .build_original } " )
176
+
177
+ if self ._snps .determine_sex ():
178
+ comment .append (f"##detectedSex={ self ._snps .determine_sex ()} " )
179
+
180
+ if self ._vcf_qc_only or self ._vcf_qc_filter :
181
+ chip_version = ""
182
+ if self ._snps .chip_version :
183
+ chip_version = f" { self ._snps .chip_version } "
184
+
185
+ if self ._snps .chip :
186
+ comment .append (
187
+ f'##detectedChip="{ self ._snps .chip } { chip_version } per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040"'
188
+ )
199
189
200
190
df = self ._snps .snps
201
191
202
192
p = self ._snps ._parallelizer
203
193
tasks = []
204
194
205
- # skip insertions and deletions
206
- df = df .drop (
207
- df .loc [
208
- df ["genotype" ].notnull ()
209
- & (
210
- (df ["genotype" ].str [0 ] == "I" )
211
- | (df ["genotype" ].str [0 ] == "D" )
212
- | (df ["genotype" ].str [1 ] == "I" )
213
- | (df ["genotype" ].str [1 ] == "D" )
214
- )
215
- ].index
216
- )
217
-
218
195
chroms_to_drop = []
219
196
for chrom in df ["chrom" ].unique ():
220
- if chrom not in reference_sequence_chroms :
197
+ if chrom not in REFERENCE_SEQUENCE_CHROMS :
221
198
chroms_to_drop .append (chrom )
222
199
continue
223
200
@@ -237,41 +214,66 @@ def _write_vcf(self):
237
214
if self ._vcf_qc_only or self ._vcf_qc_filter
238
215
else get_empty_snps_dataframe ()
239
216
),
217
+ "sex" : self ._snps .determine_sex (),
240
218
}
241
219
)
242
220
243
221
# drop chromosomes without reference sequence data (e.g., unassigned PAR)
244
222
for chrom in chroms_to_drop :
245
223
df = df .drop (df .loc [df ["chrom" ] == chrom ].index )
246
224
225
+ # Check for the presence of insertions or deletions
226
+ has_ins = df ["genotype" ].str .contains ("I" , na = False ).any ()
227
+ has_del = df ["genotype" ].str .contains ("D" , na = False ).any ()
228
+
247
229
# create the VCF representation for SNPs
248
230
results = p (self ._create_vcf_representation , tasks )
249
231
250
232
contigs = []
251
233
vcf = [pd .DataFrame ()]
252
234
discrepant_vcf_position = [pd .DataFrame ()]
253
235
for result in list (results ):
254
- contigs .append (result ["contig" ])
236
+ if result ["contig" ]:
237
+ contigs .append (result ["contig" ])
255
238
vcf .append (result ["vcf" ])
256
239
discrepant_vcf_position .append (result ["discrepant_vcf_position" ])
257
240
258
241
vcf = pd .concat (vcf )
259
242
discrepant_vcf_position = pd .concat (discrepant_vcf_position )
260
243
261
- comment += "" .join (contigs )
244
+ comment .extend (contigs )
245
+
246
+ if has_del :
247
+ comment .append (
248
+ '##ALT=<ID=DEL,Description="Deletion relative to the reference">'
249
+ )
250
+ if has_ins :
251
+ comment .append (
252
+ '##ALT=<ID=INS,Description="Insertion of novel sequence relative to the reference">'
253
+ )
254
+
255
+ if has_ins or has_del :
256
+ comment .append (
257
+ '##INFO=<ID=SVTYPE,Number=.,Type=String,Description="Type of structural variant: INS (Insertion), DEL (Deletion)">'
258
+ )
259
+ comment .append (
260
+ '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">'
261
+ )
262
262
263
263
if self ._vcf_qc_filter and self ._snps .cluster :
264
- comment += '##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">\n '
264
+ comment .append (
265
+ '##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">'
266
+ )
265
267
266
- comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n '
267
- comment += "#CHROM\t POS\t ID\t REF\t ALT\t QUAL\t FILTER\t INFO\t FORMAT\t SAMPLE\n "
268
+ comment . append ( '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' )
269
+ comment . append ( "#CHROM\t POS\t ID\t REF\t ALT\t QUAL\t FILTER\t INFO\t FORMAT\t SAMPLE" )
268
270
269
271
return (
270
272
save_df_as_csv (
271
273
vcf ,
272
274
self ._snps ._output_dir ,
273
275
filename ,
274
- comment = comment ,
276
+ comment = " \n " . join ( comment ) + " \n " ,
275
277
prepend_info = False ,
276
278
header = False ,
277
279
index = False ,
@@ -288,6 +290,7 @@ def _create_vcf_representation(self, task):
288
290
snps = task ["snps" ]
289
291
cluster = task ["cluster" ]
290
292
low_quality_snps = task ["low_quality_snps" ]
293
+ sex = task ["sex" ]
291
294
292
295
if len (snps .loc [snps ["genotype" ].notnull ()]) == 0 :
293
296
return {
@@ -299,7 +302,7 @@ def _create_vcf_representation(self, task):
299
302
seqs = resources .get_reference_sequences (assembly , [chrom ])
300
303
seq = seqs [chrom ]
301
304
302
- contig = f'##contig=<ID={ seq .ID } ,URL={ seq .url } ,length={ seq .length } ,assembly={ seq .build } ,md5={ seq .md5 } ,species="{ seq .species } ">\n '
305
+ contig = f'##contig=<ID={ self . _vcf_chrom_prefix } { seq .ID } ,URL={ seq .url } ,length={ seq .length } ,assembly={ seq .build } ,md5={ seq .md5 } ,species="{ seq .species } ">'
303
306
304
307
if self ._vcf_qc_only and cluster :
305
308
# drop low quality SNPs if SNPs object maps to a cluster
@@ -371,12 +374,27 @@ def _create_vcf_representation(self, task):
371
374
temp ["REF" ], temp ["genotype" ]
372
375
)
373
376
377
+ # Populate INFO field
378
+ df ["INFO" ] = df ["ALT" ].apply (self ._compute_info )
379
+
374
380
temp = df .loc [df ["genotype" ].notnull ()]
375
381
376
382
df .loc [df ["genotype" ].notnull (), "SAMPLE" ] = np .vectorize (
377
383
self ._compute_genotype
378
384
)(temp ["REF" ], temp ["ALT" ], temp ["genotype" ])
379
385
386
+ if sex == "Female" :
387
+ haploid_chroms = ["Y" , "MT" ]
388
+ else :
389
+ haploid_chroms = ["X" , "Y" , "MT" ]
390
+
391
+ # populate null values for haploid chromosomes
392
+ df .loc [
393
+ (df ["SAMPLE" ].isnull ())
394
+ & (df ["CHROM" ].str .contains ("|" .join (haploid_chroms ))),
395
+ "SAMPLE" ,
396
+ ] = "."
397
+
380
398
df .loc [df ["SAMPLE" ].isnull (), "SAMPLE" ] = "./."
381
399
382
400
del df ["genotype" ]
@@ -387,9 +405,18 @@ def _create_vcf_representation(self, task):
387
405
"discrepant_vcf_position" : discrepant_vcf_position ,
388
406
}
389
407
408
+ def _replace_genotype_indels (self , genotype ):
409
+ # Replace 'I' and 'D' with '<INS>' and '<DEL>'
410
+ return [
411
+ "<INS>" if allele == "I" else "<DEL>" if allele == "D" else allele
412
+ for allele in genotype
413
+ ]
414
+
390
415
def _compute_alt (self , ref , genotype ):
391
416
genotype_alleles = list (set (genotype ))
392
417
418
+ genotype_alleles = self ._replace_genotype_indels (genotype_alleles )
419
+
393
420
if ref in genotype_alleles :
394
421
if len (genotype_alleles ) == 1 :
395
422
return self ._vcf_alt_unavailable
@@ -401,6 +428,10 @@ def _compute_alt(self, ref, genotype):
401
428
return "," .join (genotype_alleles )
402
429
403
430
def _compute_genotype (self , ref , alt , genotype ):
431
+ genotype = list (genotype )
432
+
433
+ genotype = self ._replace_genotype_indels (genotype )
434
+
404
435
alleles = [ref ]
405
436
406
437
if self ._snps .phased :
@@ -417,3 +448,22 @@ def _compute_genotype(self, ref, alt, genotype):
417
448
)
418
449
else :
419
450
return f"{ alleles .index (genotype [0 ])} "
451
+
452
+ def _compute_info (self , alt ):
453
+ """Generate the INFO field based on ALT values."""
454
+ if pd .isna (alt ):
455
+ return "."
456
+
457
+ alt_values = alt .split ("," )
458
+ svtypes = []
459
+ for alt_value in alt_values :
460
+ if alt_value == "<INS>" :
461
+ svtypes .append ("INS" )
462
+ elif alt_value == "<DEL>" :
463
+ svtypes .append ("DEL" )
464
+
465
+ if not svtypes :
466
+ return "."
467
+
468
+ svtype_str = "," .join (svtypes )
469
+ return f"SVTYPE={ svtype_str } ;IMPRECISE" if svtype_str else "."
0 commit comments