Skip to content

Commit

Permalink
Precompute string lengths sgkit-dev#12
Browse files Browse the repository at this point in the history
  • Loading branch information
eric-czech committed Aug 5, 2020
1 parent 5ef6aac commit 0bb862d
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 83 deletions.
28 changes: 22 additions & 6 deletions sgkit_plink/pysnptools.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""PLINK 1.9 reader implementation"""
from pathlib import Path
from typing import Optional, Union
from typing import Optional, Union, Mapping, Any

import dask.array as da
import dask.dataframe as dd
import numpy as np
from dask.dataframe import DataFrame
from dask.array import Array
from pysnptools.snpreader import Bed
from xarray import Dataset

Expand Down Expand Up @@ -92,11 +93,26 @@ def close(self):
self.bed._close_bed() # pragma: no cover


def _to_dict(df, dtype=None):
    """Map every column of ``df`` to a dask array.

    Each column is converted with ``to_dask_array(lengths=True)`` and then
    cast. When ``dtype`` is truthy it is indexed by column name to get the
    target dtype; otherwise the column's own dtype is used.
    """
    arrays = {}
    for col in df:
        column_array = df[col].to_dask_array(lengths=True)
        # Per-column override if a dtype mapping was supplied.
        target = dtype[col] if dtype else df[col].dtype
        arrays[col] = column_array.astype(target)
    return arrays
def _max_str_len(arr: Array) -> Array:
    """Return the length of the longest string in ``arr``.

    Parameters
    ----------
    arr
        Dask array of string-like values. Each block is cast to ``str``
        before its element lengths are measured.

    Returns
    -------
    Array
        Lazy dask reduction (``.max()`` over the per-element lengths);
        call ``.compute()`` on it to obtain the value.
    """

    def _block_lengths(block):
        # Cast to a fixed integer type so every block matches the dtype
        # declared to dask below, regardless of platform.
        return np.char.str_len(block.astype(str)).astype(np.int64)

    # int64 rather than int8: an 8-bit length cannot represent strings
    # longer than 127 characters.
    return arr.map_blocks(_block_lengths, dtype=np.int64).max()


def _to_dict(
    df: DataFrame, dtype: Optional[Mapping[str, Any]] = None
) -> Mapping[str, Array]:
    """Convert each column of a dask DataFrame into a dask array.

    Parameters
    ----------
    df
        Frame whose columns are converted. Chunk lengths are computed
        (``lengths=True``) when each column is turned into an array.
    dtype
        Optional mapping of column name to target dtype. When supplied and
        non-empty it is indexed for every column of ``df``; otherwise each
        column keeps its own dtype.

    Returns
    -------
    Mapping[str, Array]
        Dict of column name to dask array. Columns whose target dtype kind
        is unicode (``'U'``) or bytes (``'S'``) are cast to a fixed-width
        dtype sized to the longest value in that column.

    Raises
    ------
    KeyError
        If ``dtype`` is non-empty but has no entry for a column of ``df``.

    Notes
    -----
    Sizing a string column calls ``.compute()`` on the length reduction, so
    this function triggers one eager computation per string column.
    """
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        dt = dtype[c] if dtype else df[c].dtype
        kind = np.dtype(dt).kind
        if kind in ("U", "S"):
            # Compute fixed-length string dtype for array. A column with no
            # rows has nothing to reduce over, so skip the max in that case.
            max_len = int(_max_str_len(a).compute()) if a.size else 0
            # NumPy treats a zero-width flexible dtype ("U0"/"S0") as
            # "unsized", so clamp to 1 to keep the result fixed-width.
            dt = f"{kind}{max(max_len, 1)}"
        arrs[c] = a.astype(dt)
    return arrs


def read_fam(path: PathType, sep: str = " ") -> DataFrame:
Expand Down
134 changes: 67 additions & 67 deletions sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
Original file line number Diff line number Diff line change
@@ -1,100 +1,100 @@
1 1:1:A:C 0.0 1 C A
1 1:2:A:C 0.0 2 C A
1 1:3:A:C 0.0 3 C A
1 1:4:A:C 0.0 4 C A
1 1:5:A:C 0.0 5 C A
1 1:6:A:C 0.0 6 C A
1 1:7:A:C 0.0 7 C A
1 1:8:A:C 0.0 8 C A
1 1:9:A:C 0.0 9 C A
1 1:1:G:CGCGCG 0.0 1 CGCGCG G
1 1:2:ACT:G 0.0 2 G ACT
1 1:3:ACT:G 0.0 3 G ACT
1 1:4:G:CGCGCG 0.0 4 CGCGCG G
1 1:5:G:CGCGCG 0.0 5 CGCGCG G
1 1:6:ACT:G 0.0 6 G ACT
1 1:7:G:CGCGCG 0.0 7 CGCGCG G
1 1:8:T:GTGG 0.0 8 GTGG T
1 1:9:T:GTGG 0.0 9 GTGG T
1 1:10:A:C 0.0 10 C A
1 1:11:A:C 0.0 11 C A
1 1:12:A:C 0.0 12 C A
1 1:13:A:C 0.0 13 C A
1 1:14:A:C 0.0 14 C A
1 1:15:A:C 0.0 15 C A
1 1:11:ACT:G 0.0 11 G ACT
1 1:12:G:CGCGCG 0.0 12 CGCGCG G
1 1:13:G:CGCGCG 0.0 13 CGCGCG G
1 1:14:T:GTGG 0.0 14 GTGG T
1 1:15:ACT:G 0.0 15 G ACT
1 1:16:A:C 0.0 16 C A
1 1:17:A:C 0.0 17 C A
1 1:18:A:C 0.0 18 C A
1 1:17:ACT:G 0.0 17 G ACT
1 1:18:T:GTGG 0.0 18 GTGG T
1 1:19:A:C 0.0 19 C A
1 1:20:A:C 0.0 20 C A
1 1:21:A:C 0.0 21 C A
1 1:22:A:C 0.0 22 C A
1 1:23:A:C 0.0 23 C A
1 1:21:T:GTGG 0.0 21 GTGG T
1 1:22:G:CGCGCG 0.0 22 CGCGCG G
1 1:23:T:GTGG 0.0 23 GTGG T
1 1:24:A:C 0.0 24 C A
1 1:25:A:C 0.0 25 C A
1 1:26:A:C 0.0 26 C A
1 1:27:A:C 0.0 27 C A
1 1:28:A:C 0.0 28 C A
1 1:29:A:C 0.0 29 C A
1 1:26:ACT:G 0.0 26 G ACT
1 1:27:G:CGCGCG 0.0 27 CGCGCG G
1 1:28:ACT:G 0.0 28 G ACT
1 1:29:T:GTGG 0.0 29 GTGG T
1 1:30:A:C 0.0 30 C A
1 1:31:A:C 0.0 31 C A
1 1:32:A:C 0.0 32 C A
1 1:33:A:C 0.0 33 C A
1 1:34:A:C 0.0 34 C A
1 1:31:T:GTGG 0.0 31 GTGG T
1 1:32:G:CGCGCG 0.0 32 CGCGCG G
1 1:33:ACT:G 0.0 33 G ACT
1 1:34:G:CGCGCG 0.0 34 CGCGCG G
1 1:35:A:C 0.0 35 C A
1 1:36:A:C 0.0 36 C A
1 1:37:A:C 0.0 37 C A
1 1:36:G:CGCGCG 0.0 36 CGCGCG G
1 1:37:T:GTGG 0.0 37 GTGG T
1 1:38:A:C 0.0 38 C A
1 1:39:A:C 0.0 39 C A
1 1:40:A:C 0.0 40 C A
1 1:40:T:GTGG 0.0 40 GTGG T
1 1:41:A:C 0.0 41 C A
1 1:42:A:C 0.0 42 C A
1 1:43:A:C 0.0 43 C A
1 1:44:A:C 0.0 44 C A
1 1:45:A:C 0.0 45 C A
1 1:46:A:C 0.0 46 C A
1 1:47:A:C 0.0 47 C A
1 1:42:G:CGCGCG 0.0 42 CGCGCG G
1 1:43:T:GTGG 0.0 43 GTGG T
1 1:44:ACT:G 0.0 44 G ACT
1 1:45:G:CGCGCG 0.0 45 CGCGCG G
1 1:46:ACT:G 0.0 46 G ACT
1 1:47:G:CGCGCG 0.0 47 CGCGCG G
1 1:48:A:C 0.0 48 C A
1 1:49:A:C 0.0 49 C A
1 1:50:A:C 0.0 50 C A
1 1:51:A:C 0.0 51 C A
1 1:51:G:CGCGCG 0.0 51 CGCGCG G
1 1:52:A:C 0.0 52 C A
1 1:53:A:C 0.0 53 C A
1 1:53:ACT:G 0.0 53 G ACT
1 1:54:A:C 0.0 54 C A
1 1:55:A:C 0.0 55 C A
1 1:56:A:C 0.0 56 C A
1 1:57:A:C 0.0 57 C A
1 1:55:G:CGCGCG 0.0 55 CGCGCG G
1 1:56:T:GTGG 0.0 56 GTGG T
1 1:57:G:CGCGCG 0.0 57 CGCGCG G
1 1:58:A:C 0.0 58 C A
1 1:59:A:C 0.0 59 C A
1 1:60:A:C 0.0 60 C A
1 1:61:A:C 0.0 61 C A
1 1:59:T:GTGG 0.0 59 GTGG T
1 1:60:G:CGCGCG 0.0 60 CGCGCG G
1 1:61:ACT:G 0.0 61 G ACT
1 1:62:A:C 0.0 62 C A
1 1:63:A:C 0.0 63 C A
1 1:64:A:C 0.0 64 C A
1 1:65:A:C 0.0 65 C A
1 1:66:A:C 0.0 66 C A
1 1:67:A:C 0.0 67 C A
1 1:68:A:C 0.0 68 C A
1 1:69:A:C 0.0 69 C A
1 1:70:A:C 0.0 70 C A
1 1:71:A:C 0.0 71 C A
1 1:72:A:C 0.0 72 C A
1 1:63:G:CGCGCG 0.0 63 CGCGCG G
1 1:64:T:GTGG 0.0 64 GTGG T
1 1:65:T:GTGG 0.0 65 GTGG T
1 1:66:ACT:G 0.0 66 G ACT
1 1:67:T:GTGG 0.0 67 GTGG T
1 1:68:ACT:G 0.0 68 G ACT
1 1:69:G:CGCGCG 0.0 69 CGCGCG G
1 1:70:G:CGCGCG 0.0 70 CGCGCG G
1 1:71:ACT:G 0.0 71 G ACT
1 1:72:G:CGCGCG 0.0 72 CGCGCG G
1 1:73:A:C 0.0 73 C A
1 1:74:A:C 0.0 74 C A
1 1:75:A:C 0.0 75 C A
1 1:75:T:GTGG 0.0 75 GTGG T
1 1:76:A:C 0.0 76 C A
1 1:77:A:C 0.0 77 C A
1 1:78:A:C 0.0 78 C A
1 1:77:ACT:G 0.0 77 G ACT
1 1:78:ACT:G 0.0 78 G ACT
1 1:79:A:C 0.0 79 C A
1 1:80:A:C 0.0 80 C A
1 1:81:A:C 0.0 81 C A
1 1:82:A:C 0.0 82 C A
1 1:82:T:GTGG 0.0 82 GTGG T
1 1:83:A:C 0.0 83 C A
1 1:84:A:C 0.0 84 C A
1 1:84:ACT:G 0.0 84 G ACT
1 1:85:A:C 0.0 85 C A
1 1:86:A:C 0.0 86 C A
1 1:87:A:C 0.0 87 C A
1 1:86:G:CGCGCG 0.0 86 CGCGCG G
1 1:87:ACT:G 0.0 87 G ACT
1 1:88:A:C 0.0 88 C A
1 1:89:A:C 0.0 89 C A
1 1:90:A:C 0.0 90 C A
1 1:91:A:C 0.0 91 C A
1 1:92:A:C 0.0 92 C A
1 1:90:T:GTGG 0.0 90 GTGG T
1 1:91:T:GTGG 0.0 91 GTGG T
1 1:92:T:GTGG 0.0 92 GTGG T
1 1:93:A:C 0.0 93 C A
1 1:94:A:C 0.0 94 C A
1 1:95:A:C 0.0 95 C A
1 1:96:A:C 0.0 96 C A
1 1:97:A:C 0.0 97 C A
1 1:98:A:C 0.0 98 C A
1 1:99:A:C 0.0 99 C A
1 1:97:T:GTGG 0.0 97 GTGG T
1 1:98:ACT:G 0.0 98 G ACT
1 1:99:T:GTGG 0.0 99 GTGG T
1 1:100:A:C 0.0 100 C A
20 changes: 10 additions & 10 deletions sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
0 0 0 0 0 NA
0 1 0 0 0 NA
0 2 0 0 0 NA
0 3 0 0 0 NA
0 4 0 0 0 NA
0 5 0 0 0 NA
0 6 0 0 0 NA
0 7 0 0 0 NA
0 8 0 0 0 NA
0 9 0 0 0 NA
0 000 0 0 0 NA
0 001 0 0 0 NA
0 002 0 0 0 NA
0 003 0 0 0 NA
0 004 0 0 0 NA
0 005 0 0 0 NA
0 006 0 0 0 NA
0 007 0 0 0 NA
0 008 0 0 0 NA
0 009 0 0 0 NA
11 changes: 11 additions & 0 deletions sgkit_plink/tests/test_pysnptools.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@ def test_raise_on_both_path_types():
read_plink(path="x", bed_path="x")


def test_fixlen_str_variable(ds1):
    """Check that string variables in ``ds1`` have the expected fixed-width dtypes."""
    expected_dtypes = {
        "sample_id": "<U3",
        "variant_id": "<U13",
        "variant_allele": "|S6",
        "sample_family_id": "<U1",
        # TODO: Remove 'None' strings https://github.com/pystatgen/sgkit-plink/issues/16
        # which should make these <U1
        "sample_maternal_id": "<U4",
        "sample_paternal_id": "<U4",
    }
    for variable, dtype_str in expected_dtypes.items():
        assert ds1[variable].dtype == np.dtype(dtype_str)


def test_read_slicing(ds1):
gt = ds1["call_genotype"]
shape = gt.shape
Expand Down

0 comments on commit 0bb862d

Please sign in to comment.