Skip to content

Commit 6ce1f8f

Browse files
Merge pull request #45 from sanogenetics/feature/23andme-missings
handle unusual 23andme files with missing values
2 parents 5b899f8 + 1d5430e commit 6ce1f8f

File tree

5 files changed

+153
-99
lines changed

5 files changed

+153
-99
lines changed

src/snps/io/reader.py

+68 -16
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@
6868

6969

7070
def get_empty_snps_dataframe():
71-
""" Get empty dataframe normalized for usage with ``snps``.
71+
"""Get empty dataframe normalized for usage with ``snps``.
7272
7373
Returns
7474
-------
@@ -81,7 +81,7 @@ def get_empty_snps_dataframe():
8181

8282

8383
class Reader:
84-
""" Class for reading and parsing raw data / genotype files. """
84+
"""Class for reading and parsing raw data / genotype files."""
8585

8686
def __init__(self, file="", only_detect_source=False, resources=None, rsids=()):
8787
"""Initialize a `Reader`.
@@ -104,7 +104,7 @@ def __init__(self, file="", only_detect_source=False, resources=None, rsids=()):
104104
self._rsids = frozenset(rsids)
105105

106106
def read(self):
107-
""" Read and parse a raw data / genotype file.
107+
"""Read and parse a raw data / genotype file.
108108
109109
Returns
110110
-------
@@ -447,18 +447,70 @@ def read_23andme(self, file, compression):
447447
"""
448448

449449
def parser():
450-
return (
451-
pd.read_csv(
452-
file,
453-
comment="#",
454-
sep="\t",
455-
na_values="--",
456-
names=["rsid", "chrom", "pos", "genotype"],
457-
index_col=0,
458-
dtype=NORMALIZED_DTYPES,
459-
compression=compression,
460-
),
450+
df = pd.read_csv(
451+
file,
452+
comment="#",
453+
sep="\t",
454+
na_values="--",
455+
names=["rsid", "chrom", "pos", "genotype"],
456+
compression=compression,
461457
)
458+
df = df.dropna(subset=["rsid", "chrom", "pos"])
459+
# normalize chromosome labels to strings (the column may hold ints or strs); labels not listed below become NaN
460+
df["chrom"] = df["chrom"].map(
461+
{
462+
"1": "1",
463+
"2": "2",
464+
"3": "3",
465+
"4": "4",
466+
"5": "5",
467+
"6": "6",
468+
"7": "7",
469+
"8": "8",
470+
"9": "9",
471+
"10": "10",
472+
"11": "11",
473+
"12": "12",
474+
"13": "13",
475+
"14": "14",
476+
"15": "15",
477+
"16": "16",
478+
"17": "17",
479+
"18": "18",
480+
"19": "19",
481+
"20": "20",
482+
"21": "21",
483+
"22": "22",
484+
1: "1",
485+
2: "2",
486+
3: "3",
487+
4: "4",
488+
5: "5",
489+
6: "6",
490+
7: "7",
491+
8: "8",
492+
9: "9",
493+
10: "10",
494+
11: "11",
495+
12: "12",
496+
13: "13",
497+
14: "14",
498+
15: "15",
499+
16: "16",
500+
17: "17",
501+
18: "18",
502+
19: "19",
503+
20: "20",
504+
21: "21",
505+
22: "22",
506+
"X": "X",
507+
"Y": "Y",
508+
"MT": "MT",
509+
}
510+
)
511+
df = df.astype(dtype=NORMALIZED_DTYPES)
512+
df = df.set_index("rsid")
513+
return (df,)
462514

463515
return self.read_helper("23andMe", parser)
464516

@@ -725,7 +777,7 @@ def parser():
725777
return self.read_helper("LivingDNA", parser)
726778

727779
def read_mapmygenome(self, file, compression, header):
728-
""" Read and parse Mapmygenome file.
780+
"""Read and parse Mapmygenome file.
729781
730782
https://mapmygenome.in
731783
@@ -1065,7 +1117,7 @@ def parser():
10651117
return self.read_helper("DNA.Land", parser)
10661118

10671119
def read_snps_csv(self, file, comments, compression):
1068-
""" Read and parse CSV file generated by ``snps``.
1120+
"""Read and parse CSV file generated by ``snps``.
10691121
10701122
https://pypi.org/project/snps/
10711123

src/snps/io/writer.py

+5 -5
Original file line number | Diff line number | Diff line change
@@ -48,10 +48,10 @@
4848

4949

5050
class Writer:
51-
""" Class for writing SNPs to files. """
51+
"""Class for writing SNPs to files."""
5252

5353
def __init__(self, snps=None, filename="", vcf=False, atomic=True, **kwargs):
54-
""" Initialize a `Writer`.
54+
"""Initialize a `Writer`.
5555
5656
Parameters
5757
----------
@@ -80,7 +80,7 @@ def write(self):
8080

8181
@classmethod
8282
def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
83-
""" Save SNPs to file.
83+
"""Save SNPs to file.
8484
8585
Parameters
8686
----------
@@ -106,7 +106,7 @@ def write_file(cls, snps=None, filename="", vcf=False, atomic=True, **kwargs):
106106
return w.write()
107107

108108
def _write_csv(self):
109-
""" Write SNPs to a CSV file.
109+
"""Write SNPs to a CSV file.
110110
111111
Returns
112112
-------
@@ -147,7 +147,7 @@ def _write_csv(self):
147147
)
148148

149149
def _write_vcf(self):
150-
""" Write SNPs to a VCF file.
150+
"""Write SNPs to a VCF file.
151151
152152
References
153153
----------

0 commit comments

Comments
 (0)