@@ -1119,29 +1119,35 @@ def _parse_vcf(self, buffer, rsids):
1119
1119
if len (alt .split ("," )) > 1 and alt .split ("," )[1 ] == "<NON_REF>" :
1120
1120
alt = alt .split ("," )[0 ]
1121
1121
1122
- zygote = line_split [9 ]
1123
- zygote = zygote .split (":" )[0 ]
1124
-
1125
1122
ref_alt = [ref ] + alt .split ("," )
1126
1123
1127
1124
# skip insertions and deletions
1128
1125
if sum (map (len , ref_alt )) > len (ref_alt ):
1129
1126
continue
1130
1127
1131
- zygote1 , zygote2 = zygote .replace ("|" , " " ).replace ("/" , " " ).split (" " )
1132
- if zygote1 == "." or zygote2 == "." :
1133
- # assign null genotypes if either allele is None
1134
- genotype = np .nan
1135
- elif (zygote1 == "0" or zygote2 == "0" ) and ref == "." :
1136
- # sample allele specifies REF allele, which is None
1137
- genotype = np .nan
1138
- elif (zygote1 == "1" or zygote2 == "1" ) and alt == "." :
1139
- # sample allele specifies ALT allele, which is None
1140
- genotype = np .nan
1141
- else :
1142
- # Could capture full genotype, if REF is None, but genotype is 1/1 or
1143
- # if ALT is None, but genotype is 0/0
1144
- genotype = ref_alt [int (zygote1 )] + ref_alt [int (zygote2 )]
1128
+ # GT (genotype) is usually the first sample-specific field
1129
+ # | = diploid phased
1130
+ # / = diploid unphased
1131
+ # or haploid e.g. male sex chromosome
1132
+ genotype = ""
1133
+ zygote = line_split [9 ]
1134
+ zygote = zygote .split (":" )[0 ]
1135
+ for z in zygote .replace ("|" , "/" ).split ("/" ):
1136
+ if z == "." :
1137
+ # missing genotype
1138
+ genotype = np .nan
1139
+ break
1140
+ z = int (z )
1141
+ if z > len (ref_alt ):
1142
+ # invalid genotype number
1143
+ genotype = np .nan
1144
+ break
1145
+ elif ref_alt [z ] == "." :
1146
+ # missing genotype in ref or alt
1147
+ genotype = np .nan
1148
+ break
1149
+ else :
1150
+ genotype = genotype + ref_alt [z ]
1145
1151
1146
1152
if "/" in zygote and pd .notna (genotype ):
1147
1153
phased = False
0 commit comments