From d554a6062370abbdd9840e049f5532663d0b3238 Mon Sep 17 00:00:00 2001 From: Andrew Riha Date: Sun, 9 Sep 2018 15:32:32 -0700 Subject: [PATCH] Parse data sources when reading lineage CSV file --- lineage/individual.py | 6 +++--- lineage/snps.py | 16 ++++++++++++---- tests/test_individual.py | 22 +++++++++++++++++++--- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/lineage/individual.py b/lineage/individual.py index e7991b9..8ef20ab 100644 --- a/lineage/individual.py +++ b/lineage/individual.py @@ -371,7 +371,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold, return discrepant_positions, discrepant_genotypes build = snps.build - source = snps.source + source = [s.strip() for s in snps.source.split(',')] if not snps.build_detected: print('build not detected, assuming build {}'.format(snps.build)) @@ -385,7 +385,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold, snps = self._double_single_alleles(snps.snps, 'X') if self._snps is None: - self._source.append(source) + self._source.extend(source) self._snps = snps else: common_snps = self._snps.join(snps, how='inner', rsuffix='_added') @@ -439,7 +439,7 @@ def _add_snps(self, snps, discrepant_snp_positions_threshold, return discrepant_positions, discrepant_genotypes # add new SNPs - self._source.append(source) + self._source.extend(source) self._snps = self._snps.combine_first(snps) self._snps.loc[discrepant_genotypes.index, 'genotype'] = np.nan diff --git a/lineage/snps.py b/lineage/snps.py index 3cd1a6e..52bab91 100644 --- a/lineage/snps.py +++ b/lineage/snps.py @@ -135,7 +135,7 @@ def _read_raw_data(self, file): elif first_line.startswith('RSID'): return self._read_ftdna(file) elif 'lineage' in first_line: - return self._read_lineage_csv(file) + return self._read_lineage_csv(file, comments) elif first_line.startswith('rsid'): return self._read_generic_csv(file) else: @@ -256,26 +256,34 @@ def _read_ancestry(file): return sort_snps(df), 'AncestryDNA' @staticmethod - def _read_lineage_csv(file): + def _read_lineage_csv(file, comments): """ Read and parse CSV file generated by lineage. Parameters ---------- file : str path to file + comments : str + comments at beginning of file Returns ------- pandas.DataFrame individual's genetic data normalized for use with `lineage` str - name of data source + name of data source(s) """ + source = '' + for comment in comments.split('\n'): + if 'Source(s):' in comment: + source = comment.split('Source(s):')[1].strip() + break + df = pd.read_csv(file, comment='#', header=0, na_values='--', names=['rsid', 'chrom', 'pos', 'genotype'], index_col=0, dtype={'chrom': object, 'pos': np.int64}) - return sort_snps(df), 'lineage' + return sort_snps(df), source @staticmethod def _read_generic_csv(file): diff --git a/tests/test_individual.py b/tests/test_individual.py index e7a847f..2178f0f 100644 --- a/tests/test_individual.py +++ b/tests/test_individual.py @@ -120,12 +120,28 @@ def test_snps_ancestry(l, generic_snps): pd.testing.assert_frame_equal(ind.snps, generic_snps) -def test_source_lineage(l): - ind = l.create_individual('', 'tests/input/chromosomes.csv') +def test_source_lineage_file(l): + ind = l.create_individual('', 'tests/input/GRCh37.csv') assert ind.source == 'generic' + ind.load_snps('tests/input/23andme.txt') + assert ind.source == 'generic, 23andMe' file = ind.save_snps() ind_saved_snps = l.create_individual('', file) - assert ind_saved_snps.source == 'lineage' + assert ind_saved_snps.source == 'generic, 23andMe' + pd.testing.assert_frame_equal(ind.snps, ind_saved_snps.snps) + + +def test_source_lineage_file_gzip(l): + ind = l.create_individual('', 'tests/input/GRCh37.csv') + assert ind.source == 'generic' + ind.load_snps('tests/input/23andme.txt') + assert ind.source == 'generic, 23andMe' + file = ind.save_snps() + with open(file, 'rb') as f_in: + with gzip.open(file + '.gz', 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + ind_saved_snps = l.create_individual('', file + '.gz') + assert ind_saved_snps.source == 'generic, 23andMe' pd.testing.assert_frame_equal(ind.snps, ind_saved_snps.snps)