Skip to content

Commit c702fa6

Browse files
authored
Merge pull request #184 from sanogenetics/feature/enhance-vcf-output
Enhance VCF output
2 parents 162d86e + eaf5824 commit c702fa6

File tree

7 files changed

+268
-102
lines changed

7 files changed

+268
-102
lines changed

src/snps/constants.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
REFERENCE_SEQUENCE_CHROMS = tuple(str(i) for i in range(1, 23)) + ("X", "Y", "MT")

src/snps/io/reader.py

+20-5
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def read(self):
163163
elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line):
164164
d = self.read_generic(file, compression, skip=0)
165165
elif "vcf" in comments.lower() or "##contig" in comments.lower():
166-
d = self.read_vcf(file, compression, "vcf", self._rsids)
166+
d = self.read_vcf(file, compression, "vcf", self._rsids, comments)
167167
elif ("Genes for Good" in comments) | ("PLINK" in comments):
168168
d = self.read_genes_for_good(file, compression)
169169
elif "DNA.Land" in comments:
@@ -1355,7 +1355,7 @@ def parse(sep):
13551355

13561356
return self.read_helper("generic", parser)
13571357

1358-
def read_vcf(self, file, compression, provider, rsids=()):
1358+
def read_vcf(self, file, compression, provider, rsids=(), comments=""):
13591359
"""Read and parse VCF file.
13601360
13611361
Notes
@@ -1366,7 +1366,7 @@ def read_vcf(self, file, compression, provider, rsids=()):
13661366
* SNPs that are not annotated with an RSID are skipped
13671367
* If the VCF contains multiple samples, only the first sample is used to
13681368
lookup the genotype
1369-
* Insertions and deletions are skipped
1369+
* Precise insertions and deletions are skipped
13701370
* If a sample allele is not specified, the genotype is reported as NaN
13711371
* If a sample allele refers to a REF or ALT allele that is not specified,
13721372
the genotype is reported as NaN
@@ -1393,6 +1393,17 @@ def parser():
13931393

13941394
return (df, phased)
13951395

1396+
header_lines = comments.replace("\r\n", "\n").split("\n")
1397+
1398+
detected_company = ""
1399+
for line in header_lines:
1400+
if line.startswith("##detectedCompany="):
1401+
detected_company = line.split("=")[1].strip('"')
1402+
break
1403+
1404+
if detected_company:
1405+
provider = detected_company
1406+
13961407
return self.read_helper(provider, parser)
13971408

13981409
def _parse_vcf(self, buffer, rsids):
@@ -1441,9 +1452,13 @@ def _parse_vcf(self, buffer, rsids):
14411452
if len(alt.split(",")) > 1 and alt.split(",")[1] == "<NON_REF>":
14421453
alt = alt.split(",")[0]
14431454

1444-
ref_alt = [ref] + alt.split(",")
1455+
# Handle <INS> and <DEL> alleles (imprecise insertions and deletions)
1456+
ref_alt = [ref] + [
1457+
"I" if allele == "<INS>" else "D" if allele == "<DEL>" else allele
1458+
for allele in alt.split(",")
1459+
]
14451460

1446-
# skip insertions and deletions
1461+
# skip precise insertions and deletions
14471462
if sum(map(len, ref_alt)) > len(ref_alt):
14481463
continue
14491464

src/snps/io/writer.py

+114-64
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import pandas as pd
88

99
import snps
10+
from snps.constants import REFERENCE_SEQUENCE_CHROMS
1011
from snps.io import get_empty_snps_dataframe
1112
from snps.utils import clean_str, get_utc_now, save_df_as_csv
1213

@@ -120,14 +121,14 @@ def _write_csv(self):
120121

121122
filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}{ext}"
122123

123-
comment = (
124-
f"# Source(s): {self._snps.source}\n"
125-
f"# Build: {self._snps.build}\n"
126-
f"# Build Detected: { self._snps.build_detected}\n"
127-
f"# Phased: {self._snps.phased}\n"
128-
f"# SNPs: {self._snps.count}\n"
129-
f"# Chromosomes: {self._snps.chromosomes_summary}\n"
130-
)
124+
comment = [
125+
f"# Source(s): {self._snps.source}",
126+
f"# Build: {self._snps.build}",
127+
f"# Build Detected: { self._snps.build_detected}",
128+
f"# Phased: {self._snps.phased}",
129+
f"# SNPs: {self._snps.count}",
130+
f"# Chromosomes: {self._snps.chromosomes_summary}",
131+
]
131132
if "header" in self._kwargs:
132133
if isinstance(self._kwargs["header"], bool):
133134
if self._kwargs["header"]:
@@ -139,7 +140,7 @@ def _write_csv(self):
139140
self._snps._snps,
140141
self._snps._output_dir,
141142
filename,
142-
comment=comment,
143+
comment="\n".join(comment) + "\n",
143144
atomic=self._atomic,
144145
**self._kwargs,
145146
)
@@ -149,8 +150,8 @@ def _write_vcf(self):
149150
150151
References
151152
----------
152-
1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019,
153-
https://samtools.github.io/hts-specs/VCFv4.2.pdf
153+
1. The Variant Call Format (VCF) Version 4.3 Specification, 27 Nov 2022,
154+
https://samtools.github.io/hts-specs/VCFv4.3.pdf
154155
155156
Returns
156157
-------
@@ -163,61 +164,37 @@ def _write_vcf(self):
163164
if not filename:
164165
filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}{'.vcf'}"
165166

166-
comment = (
167-
f"##fileformat=VCFv4.2\n"
168-
f'##fileDate={get_utc_now().strftime("%Y%m%d")}\n'
169-
f'##source="{self._snps.source}; snps v{snps.__version__}; https://pypi.org/project/snps/"\n'
170-
)
167+
comment = [
168+
"##fileformat=VCFv4.3",
169+
f'##fileDate={get_utc_now().strftime("%Y%m%d")}',
170+
f'##source="snps v{snps.__version__}; https://pypi.org/project/snps/"',
171+
f'##detectedCompany="{self._snps.source}"',
172+
]
171173

172-
reference_sequence_chroms = (
173-
"1",
174-
"2",
175-
"3",
176-
"4",
177-
"5",
178-
"6",
179-
"7",
180-
"8",
181-
"9",
182-
"10",
183-
"11",
184-
"12",
185-
"13",
186-
"14",
187-
"15",
188-
"16",
189-
"17",
190-
"18",
191-
"19",
192-
"20",
193-
"21",
194-
"22",
195-
"X",
196-
"Y",
197-
"MT",
198-
)
174+
if self._snps.build_original:
175+
comment.append(f"##detectedOriginalBuild={self._snps.build_original}")
176+
177+
if self._snps.determine_sex():
178+
comment.append(f"##detectedSex={self._snps.determine_sex()}")
179+
180+
if self._vcf_qc_only or self._vcf_qc_filter:
181+
chip_version = ""
182+
if self._snps.chip_version:
183+
chip_version = f" {self._snps.chip_version}"
184+
185+
if self._snps.chip:
186+
comment.append(
187+
f'##detectedChip="{self._snps.chip}{chip_version} per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040"'
188+
)
199189

200190
df = self._snps.snps
201191

202192
p = self._snps._parallelizer
203193
tasks = []
204194

205-
# skip insertions and deletions
206-
df = df.drop(
207-
df.loc[
208-
df["genotype"].notnull()
209-
& (
210-
(df["genotype"].str[0] == "I")
211-
| (df["genotype"].str[0] == "D")
212-
| (df["genotype"].str[1] == "I")
213-
| (df["genotype"].str[1] == "D")
214-
)
215-
].index
216-
)
217-
218195
chroms_to_drop = []
219196
for chrom in df["chrom"].unique():
220-
if chrom not in reference_sequence_chroms:
197+
if chrom not in REFERENCE_SEQUENCE_CHROMS:
221198
chroms_to_drop.append(chrom)
222199
continue
223200

@@ -237,41 +214,66 @@ def _write_vcf(self):
237214
if self._vcf_qc_only or self._vcf_qc_filter
238215
else get_empty_snps_dataframe()
239216
),
217+
"sex": self._snps.determine_sex(),
240218
}
241219
)
242220

243221
# drop chromosomes without reference sequence data (e.g., unassigned PAR)
244222
for chrom in chroms_to_drop:
245223
df = df.drop(df.loc[df["chrom"] == chrom].index)
246224

225+
# Check for the presence of insertions or deletions
226+
has_ins = df["genotype"].str.contains("I", na=False).any()
227+
has_del = df["genotype"].str.contains("D", na=False).any()
228+
247229
# create the VCF representation for SNPs
248230
results = p(self._create_vcf_representation, tasks)
249231

250232
contigs = []
251233
vcf = [pd.DataFrame()]
252234
discrepant_vcf_position = [pd.DataFrame()]
253235
for result in list(results):
254-
contigs.append(result["contig"])
236+
if result["contig"]:
237+
contigs.append(result["contig"])
255238
vcf.append(result["vcf"])
256239
discrepant_vcf_position.append(result["discrepant_vcf_position"])
257240

258241
vcf = pd.concat(vcf)
259242
discrepant_vcf_position = pd.concat(discrepant_vcf_position)
260243

261-
comment += "".join(contigs)
244+
comment.extend(contigs)
245+
246+
if has_del:
247+
comment.append(
248+
'##ALT=<ID=DEL,Description="Deletion relative to the reference">'
249+
)
250+
if has_ins:
251+
comment.append(
252+
'##ALT=<ID=INS,Description="Insertion of novel sequence relative to the reference">'
253+
)
254+
255+
if has_ins or has_del:
256+
comment.append(
257+
'##INFO=<ID=SVTYPE,Number=.,Type=String,Description="Type of structural variant: INS (Insertion), DEL (Deletion)">'
258+
)
259+
comment.append(
260+
'##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">'
261+
)
262262

263263
if self._vcf_qc_filter and self._snps.cluster:
264-
comment += '##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">\n'
264+
comment.append(
265+
'##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">'
266+
)
265267

266-
comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
267-
comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
268+
comment.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
269+
comment.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")
268270

269271
return (
270272
save_df_as_csv(
271273
vcf,
272274
self._snps._output_dir,
273275
filename,
274-
comment=comment,
276+
comment="\n".join(comment) + "\n",
275277
prepend_info=False,
276278
header=False,
277279
index=False,
@@ -288,6 +290,7 @@ def _create_vcf_representation(self, task):
288290
snps = task["snps"]
289291
cluster = task["cluster"]
290292
low_quality_snps = task["low_quality_snps"]
293+
sex = task["sex"]
291294

292295
if len(snps.loc[snps["genotype"].notnull()]) == 0:
293296
return {
@@ -299,7 +302,7 @@ def _create_vcf_representation(self, task):
299302
seqs = resources.get_reference_sequences(assembly, [chrom])
300303
seq = seqs[chrom]
301304

302-
contig = f'##contig=<ID={seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">\n'
305+
contig = f'##contig=<ID={self._vcf_chrom_prefix}{seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">'
303306

304307
if self._vcf_qc_only and cluster:
305308
# drop low quality SNPs if SNPs object maps to a cluster
@@ -371,12 +374,27 @@ def _create_vcf_representation(self, task):
371374
temp["REF"], temp["genotype"]
372375
)
373376

377+
# Populate INFO field
378+
df["INFO"] = df["ALT"].apply(self._compute_info)
379+
374380
temp = df.loc[df["genotype"].notnull()]
375381

376382
df.loc[df["genotype"].notnull(), "SAMPLE"] = np.vectorize(
377383
self._compute_genotype
378384
)(temp["REF"], temp["ALT"], temp["genotype"])
379385

386+
if sex == "Female":
387+
haploid_chroms = ["Y", "MT"]
388+
else:
389+
haploid_chroms = ["X", "Y", "MT"]
390+
391+
# populate null values for haploid chromosomes
392+
df.loc[
393+
(df["SAMPLE"].isnull())
394+
& (df["CHROM"].str.contains("|".join(haploid_chroms))),
395+
"SAMPLE",
396+
] = "."
397+
380398
df.loc[df["SAMPLE"].isnull(), "SAMPLE"] = "./."
381399

382400
del df["genotype"]
@@ -387,9 +405,18 @@ def _create_vcf_representation(self, task):
387405
"discrepant_vcf_position": discrepant_vcf_position,
388406
}
389407

408+
def _replace_genotype_indels(self, genotype):
409+
# Replace 'I' and 'D' with '<INS>' and '<DEL>'
410+
return [
411+
"<INS>" if allele == "I" else "<DEL>" if allele == "D" else allele
412+
for allele in genotype
413+
]
414+
390415
def _compute_alt(self, ref, genotype):
391416
genotype_alleles = list(set(genotype))
392417

418+
genotype_alleles = self._replace_genotype_indels(genotype_alleles)
419+
393420
if ref in genotype_alleles:
394421
if len(genotype_alleles) == 1:
395422
return self._vcf_alt_unavailable
@@ -401,6 +428,10 @@ def _compute_alt(self, ref, genotype):
401428
return ",".join(genotype_alleles)
402429

403430
def _compute_genotype(self, ref, alt, genotype):
431+
genotype = list(genotype)
432+
433+
genotype = self._replace_genotype_indels(genotype)
434+
404435
alleles = [ref]
405436

406437
if self._snps.phased:
@@ -417,3 +448,22 @@ def _compute_genotype(self, ref, alt, genotype):
417448
)
418449
else:
419450
return f"{alleles.index(genotype[0])}"
451+
452+
def _compute_info(self, alt):
453+
"""Generate the INFO field based on ALT values."""
454+
if pd.isna(alt):
455+
return "."
456+
457+
alt_values = alt.split(",")
458+
svtypes = []
459+
for alt_value in alt_values:
460+
if alt_value == "<INS>":
461+
svtypes.append("INS")
462+
elif alt_value == "<DEL>":
463+
svtypes.append("DEL")
464+
465+
if not svtypes:
466+
return "."
467+
468+
svtype_str = ",".join(svtypes)
469+
return f"SVTYPE={svtype_str};IMPRECISE" if svtype_str else "."

0 commit comments

Comments
 (0)