This repository has been archived by the owner on Oct 15, 2020. It is now read-only.

Commit

Merge pull request #20 from eric-czech/str_len
Precompute fixed-length string dtypes
eric-czech authored Aug 24, 2020
2 parents c4e5c38 + 467d4bd commit f62fc70
Showing 4 changed files with 115 additions and 83 deletions.
28 changes: 22 additions & 6 deletions sgkit_plink/pysnptools.py
@@ -1,10 +1,11 @@
"""PLINK 1.9 reader implementation"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, List, Mapping, Optional, Tuple, Union

import dask.array as da
import dask.dataframe as dd
import numpy as np
from dask.array import Array
from dask.dataframe import DataFrame
from pysnptools.snpreader import Bed
from xarray import Dataset
@@ -98,11 +99,26 @@ def close(self) -> None:
        self.bed._close_bed()  # pragma: no cover


def _to_dict(df: dd.DataFrame, dtype: Any = None) -> Dict[str, da.Array]:
    return {
        c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
        for c in df
    }
def _max_str_len(arr: Array) -> Array:
    return arr.map_blocks(lambda s: np.char.str_len(s.astype(str)), dtype=np.int8).max()


def _to_dict(
    df: DataFrame, dtype: Optional[Mapping[str, Any]] = None
) -> Mapping[str, Any]:
    arrs = {}
    for c in df:
        a = df[c].to_dask_array(lengths=True)
        dt = df[c].dtype
        if dtype:
            dt = dtype[c]
        kind = np.dtype(dt).kind
        if kind in ["U", "S"]:
            # Compute fixed-length string dtype for array
            max_len = _max_str_len(a).compute()
            dt = f"{kind}{max_len}"
        arrs[c] = a.astype(dt)
    return arrs


def read_fam(path: PathType, sep: str = " ") -> DataFrame:
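The key change above is that string-valued columns are now cast to a fixed-length NumPy dtype computed from the data, instead of being left as variable-width object arrays. A minimal, self-contained sketch of that idea, using made-up values rather than the real PLINK columns (only dask.array and numpy are assumed, as in the diff):

import dask.array as da
import numpy as np

values = da.from_array(np.array(["A", "ACT", "CGCGCG"], dtype=object), chunks=2)
# Longest string length across all blocks, as in _max_str_len
max_len = values.map_blocks(
    lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
).max().compute()
# Cast to a fixed-length dtype, as _to_dict now does for "U"/"S" columns
fixed = values.astype(f"U{max_len}")
print(fixed.dtype)  # dtype('<U6')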
134 changes: 67 additions & 67 deletions sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
@@ -1,100 +1,100 @@
1 1:1:A:C 0.0 1 C A
1 1:2:A:C 0.0 2 C A
1 1:3:A:C 0.0 3 C A
1 1:4:A:C 0.0 4 C A
1 1:5:A:C 0.0 5 C A
1 1:6:A:C 0.0 6 C A
1 1:7:A:C 0.0 7 C A
1 1:8:A:C 0.0 8 C A
1 1:9:A:C 0.0 9 C A
1 1:1:G:CGCGCG 0.0 1 CGCGCG G
1 1:2:ACT:G 0.0 2 G ACT
1 1:3:ACT:G 0.0 3 G ACT
1 1:4:G:CGCGCG 0.0 4 CGCGCG G
1 1:5:G:CGCGCG 0.0 5 CGCGCG G
1 1:6:ACT:G 0.0 6 G ACT
1 1:7:G:CGCGCG 0.0 7 CGCGCG G
1 1:8:T:GTGG 0.0 8 GTGG T
1 1:9:T:GTGG 0.0 9 GTGG T
1 1:10:A:C 0.0 10 C A
1 1:11:A:C 0.0 11 C A
1 1:12:A:C 0.0 12 C A
1 1:13:A:C 0.0 13 C A
1 1:14:A:C 0.0 14 C A
1 1:15:A:C 0.0 15 C A
1 1:11:ACT:G 0.0 11 G ACT
1 1:12:G:CGCGCG 0.0 12 CGCGCG G
1 1:13:G:CGCGCG 0.0 13 CGCGCG G
1 1:14:T:GTGG 0.0 14 GTGG T
1 1:15:ACT:G 0.0 15 G ACT
1 1:16:A:C 0.0 16 C A
1 1:17:A:C 0.0 17 C A
1 1:18:A:C 0.0 18 C A
1 1:17:ACT:G 0.0 17 G ACT
1 1:18:T:GTGG 0.0 18 GTGG T
1 1:19:A:C 0.0 19 C A
1 1:20:A:C 0.0 20 C A
1 1:21:A:C 0.0 21 C A
1 1:22:A:C 0.0 22 C A
1 1:23:A:C 0.0 23 C A
1 1:21:T:GTGG 0.0 21 GTGG T
1 1:22:G:CGCGCG 0.0 22 CGCGCG G
1 1:23:T:GTGG 0.0 23 GTGG T
1 1:24:A:C 0.0 24 C A
1 1:25:A:C 0.0 25 C A
1 1:26:A:C 0.0 26 C A
1 1:27:A:C 0.0 27 C A
1 1:28:A:C 0.0 28 C A
1 1:29:A:C 0.0 29 C A
1 1:26:ACT:G 0.0 26 G ACT
1 1:27:G:CGCGCG 0.0 27 CGCGCG G
1 1:28:ACT:G 0.0 28 G ACT
1 1:29:T:GTGG 0.0 29 GTGG T
1 1:30:A:C 0.0 30 C A
1 1:31:A:C 0.0 31 C A
1 1:32:A:C 0.0 32 C A
1 1:33:A:C 0.0 33 C A
1 1:34:A:C 0.0 34 C A
1 1:31:T:GTGG 0.0 31 GTGG T
1 1:32:G:CGCGCG 0.0 32 CGCGCG G
1 1:33:ACT:G 0.0 33 G ACT
1 1:34:G:CGCGCG 0.0 34 CGCGCG G
1 1:35:A:C 0.0 35 C A
1 1:36:A:C 0.0 36 C A
1 1:37:A:C 0.0 37 C A
1 1:36:G:CGCGCG 0.0 36 CGCGCG G
1 1:37:T:GTGG 0.0 37 GTGG T
1 1:38:A:C 0.0 38 C A
1 1:39:A:C 0.0 39 C A
1 1:40:A:C 0.0 40 C A
1 1:40:T:GTGG 0.0 40 GTGG T
1 1:41:A:C 0.0 41 C A
1 1:42:A:C 0.0 42 C A
1 1:43:A:C 0.0 43 C A
1 1:44:A:C 0.0 44 C A
1 1:45:A:C 0.0 45 C A
1 1:46:A:C 0.0 46 C A
1 1:47:A:C 0.0 47 C A
1 1:42:G:CGCGCG 0.0 42 CGCGCG G
1 1:43:T:GTGG 0.0 43 GTGG T
1 1:44:ACT:G 0.0 44 G ACT
1 1:45:G:CGCGCG 0.0 45 CGCGCG G
1 1:46:ACT:G 0.0 46 G ACT
1 1:47:G:CGCGCG 0.0 47 CGCGCG G
1 1:48:A:C 0.0 48 C A
1 1:49:A:C 0.0 49 C A
1 1:50:A:C 0.0 50 C A
1 1:51:A:C 0.0 51 C A
1 1:51:G:CGCGCG 0.0 51 CGCGCG G
1 1:52:A:C 0.0 52 C A
1 1:53:A:C 0.0 53 C A
1 1:53:ACT:G 0.0 53 G ACT
1 1:54:A:C 0.0 54 C A
1 1:55:A:C 0.0 55 C A
1 1:56:A:C 0.0 56 C A
1 1:57:A:C 0.0 57 C A
1 1:55:G:CGCGCG 0.0 55 CGCGCG G
1 1:56:T:GTGG 0.0 56 GTGG T
1 1:57:G:CGCGCG 0.0 57 CGCGCG G
1 1:58:A:C 0.0 58 C A
1 1:59:A:C 0.0 59 C A
1 1:60:A:C 0.0 60 C A
1 1:61:A:C 0.0 61 C A
1 1:59:T:GTGG 0.0 59 GTGG T
1 1:60:G:CGCGCG 0.0 60 CGCGCG G
1 1:61:ACT:G 0.0 61 G ACT
1 1:62:A:C 0.0 62 C A
1 1:63:A:C 0.0 63 C A
1 1:64:A:C 0.0 64 C A
1 1:65:A:C 0.0 65 C A
1 1:66:A:C 0.0 66 C A
1 1:67:A:C 0.0 67 C A
1 1:68:A:C 0.0 68 C A
1 1:69:A:C 0.0 69 C A
1 1:70:A:C 0.0 70 C A
1 1:71:A:C 0.0 71 C A
1 1:72:A:C 0.0 72 C A
1 1:63:G:CGCGCG 0.0 63 CGCGCG G
1 1:64:T:GTGG 0.0 64 GTGG T
1 1:65:T:GTGG 0.0 65 GTGG T
1 1:66:ACT:G 0.0 66 G ACT
1 1:67:T:GTGG 0.0 67 GTGG T
1 1:68:ACT:G 0.0 68 G ACT
1 1:69:G:CGCGCG 0.0 69 CGCGCG G
1 1:70:G:CGCGCG 0.0 70 CGCGCG G
1 1:71:ACT:G 0.0 71 G ACT
1 1:72:G:CGCGCG 0.0 72 CGCGCG G
1 1:73:A:C 0.0 73 C A
1 1:74:A:C 0.0 74 C A
1 1:75:A:C 0.0 75 C A
1 1:75:T:GTGG 0.0 75 GTGG T
1 1:76:A:C 0.0 76 C A
1 1:77:A:C 0.0 77 C A
1 1:78:A:C 0.0 78 C A
1 1:77:ACT:G 0.0 77 G ACT
1 1:78:ACT:G 0.0 78 G ACT
1 1:79:A:C 0.0 79 C A
1 1:80:A:C 0.0 80 C A
1 1:81:A:C 0.0 81 C A
1 1:82:A:C 0.0 82 C A
1 1:82:T:GTGG 0.0 82 GTGG T
1 1:83:A:C 0.0 83 C A
1 1:84:A:C 0.0 84 C A
1 1:84:ACT:G 0.0 84 G ACT
1 1:85:A:C 0.0 85 C A
1 1:86:A:C 0.0 86 C A
1 1:87:A:C 0.0 87 C A
1 1:86:G:CGCGCG 0.0 86 CGCGCG G
1 1:87:ACT:G 0.0 87 G ACT
1 1:88:A:C 0.0 88 C A
1 1:89:A:C 0.0 89 C A
1 1:90:A:C 0.0 90 C A
1 1:91:A:C 0.0 91 C A
1 1:92:A:C 0.0 92 C A
1 1:90:T:GTGG 0.0 90 GTGG T
1 1:91:T:GTGG 0.0 91 GTGG T
1 1:92:T:GTGG 0.0 92 GTGG T
1 1:93:A:C 0.0 93 C A
1 1:94:A:C 0.0 94 C A
1 1:95:A:C 0.0 95 C A
1 1:96:A:C 0.0 96 C A
1 1:97:A:C 0.0 97 C A
1 1:98:A:C 0.0 98 C A
1 1:99:A:C 0.0 99 C A
1 1:97:T:GTGG 0.0 97 GTGG T
1 1:98:ACT:G 0.0 98 G ACT
1 1:99:T:GTGG 0.0 99 GTGG T
1 1:100:A:C 0.0 100 C A
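For context, each .bim row lists chromosome, variant ID, genetic distance, base-pair position, and the two alleles; the rewritten rows above introduce alleles of different lengths (ACT, GTGG, CGCGCG), which is what exercises the new fixed-length string logic. A hypothetical parsing sketch, not taken from the repo (column names are illustrative):

import io
import pandas as pd

row = "1\t1:1:G:CGCGCG\t0.0\t1\tCGCGCG\tG\n"
cols = ["contig", "variant_id", "cm_position", "position", "allele_1", "allele_2"]
print(pd.read_csv(io.StringIO(row), sep="\t", names=cols).iloc[0].to_dict())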
20 changes: 10 additions & 10 deletions sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
@@ -1,10 +1,10 @@
0 0 0 0 0 NA
0 1 0 0 0 NA
0 2 0 0 0 NA
0 3 0 0 0 NA
0 4 0 0 0 NA
0 5 0 0 0 NA
0 6 0 0 0 NA
0 7 0 0 0 NA
0 8 0 0 0 NA
0 9 0 0 0 NA
0 000 0 0 0 NA
0 001 0 0 0 NA
0 002 0 0 0 NA
0 003 0 0 0 NA
0 004 0 0 0 NA
0 005 0 0 0 NA
0 006 0 0 0 NA
0 007 0 0 0 NA
0 008 0 0 0 NA
0 009 0 0 0 NA
16 changes: 16 additions & 0 deletions sgkit_plink/tests/test_pysnptools.py
@@ -3,6 +3,11 @@
import xarray as xr
from sgkit_plink.pysnptools import read_plink

# This data was generated externally using Hail
# for 10 samples, 100 variants, and genotype calls
# that are missing in ~10% of cases.
# TODO: document and move code to central location
# (cf. https://github.com/pystatgen/sgkit-plink/pull/20#discussion_r466907811)
example_dataset_1 = "plink_sim_10s_100v_10pmiss"


@@ -32,6 +37,17 @@ def test_raise_on_both_path_types():
        read_plink(path="x", bed_path="x")


def test_fixlen_str_variable(ds1):
    assert ds1["sample_id"].dtype == np.dtype("<U3")
    assert ds1["variant_id"].dtype == np.dtype("<U13")
    assert ds1["variant_allele"].dtype == np.dtype("|S6")
    assert ds1["sample_family_id"].dtype == np.dtype("<U1")
    # TODO: Remove 'None' strings https://github.com/pystatgen/sgkit-plink/issues/16
    # which should make these <U1
    assert ds1["sample_maternal_id"].dtype == np.dtype("<U4")
    assert ds1["sample_paternal_id"].dtype == np.dtype("<U4")


def test_read_slicing(ds1):
    gt = ds1["call_genotype"]
    shape = gt.shape
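For readers less familiar with NumPy dtype strings: "<U3" is a fixed-width Unicode string of up to 3 characters and "|S6" a fixed-width byte string of up to 6 bytes, which is exactly what the precomputation in _to_dict now produces. A small illustration with values mirroring the test data above:

import numpy as np

sample_ids = np.array(["000", "001", "009"])  # longest entry: 3 characters
alleles = np.array([b"A", b"CGCGCG"])         # longest entry: 6 bytes
print(sample_ids.dtype)  # <U3
print(alleles.dtype)     # |S6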
