diff --git a/sgkit_plink/pysnptools.py b/sgkit_plink/pysnptools.py index e59bdd4..2d21147 100644 --- a/sgkit_plink/pysnptools.py +++ b/sgkit_plink/pysnptools.py @@ -1,11 +1,12 @@ """PLINK 1.9 reader implementation""" from pathlib import Path -from typing import Optional, Union +from typing import Optional, Union, Mapping, Any import dask.array as da import dask.dataframe as dd import numpy as np from dask.dataframe import DataFrame +from dask.array import Array from pysnptools.snpreader import Bed from xarray import Dataset @@ -92,11 +93,26 @@ def close(self): self.bed._close_bed() # pragma: no cover -def _to_dict(df, dtype=None): - return { - c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype) - for c in df - } +def _max_str_len(arr: Array) -> Array: + return arr.map_blocks( + lambda s: np.char.str_len(s.astype(str)), dtype=np.int8 + ).max() + + +def _to_dict(df: DataFrame, dtype: Mapping[str, Any]=None): + arrs = {} + for c in df: + a = df[c].to_dask_array(lengths=True) + dt = df[c].dtype + if dtype: + dt = dtype[c] + kind = np.dtype(dt).kind + if kind in ['U', 'S']: + # Compute fixed-length string dtype for array + max_len = _max_str_len(a).compute() + dt = f"{kind}{max_len}" + arrs[c] = a.astype(dt) + return arrs def read_fam(path: PathType, sep: str = " ") -> DataFrame: diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim index 01325a9..3e668d5 100644 --- a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim +++ b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim @@ -1,100 +1,100 @@ -1 1:1:A:C 0.0 1 C A -1 1:2:A:C 0.0 2 C A -1 1:3:A:C 0.0 3 C A -1 1:4:A:C 0.0 4 C A -1 1:5:A:C 0.0 5 C A -1 1:6:A:C 0.0 6 C A -1 1:7:A:C 0.0 7 C A -1 1:8:A:C 0.0 8 C A -1 1:9:A:C 0.0 9 C A +1 1:1:G:CGCGCG 0.0 1 CGCGCG G +1 1:2:ACT:G 0.0 2 G ACT +1 1:3:ACT:G 0.0 3 G ACT +1 1:4:G:CGCGCG 0.0 4 CGCGCG G +1 1:5:G:CGCGCG 0.0 5 CGCGCG G +1 1:6:ACT:G 0.0 6 G ACT +1 1:7:G:CGCGCG 0.0 7 CGCGCG G +1 1:8:T:GTGG 0.0 8 GTGG T +1 1:9:T:GTGG 0.0 9 GTGG T 1 1:10:A:C 0.0 10 C A -1 1:11:A:C 0.0 11 C A -1 1:12:A:C 0.0 12 C A -1 1:13:A:C 0.0 13 C A -1 1:14:A:C 0.0 14 C A -1 1:15:A:C 0.0 15 C A +1 1:11:ACT:G 0.0 11 G ACT +1 1:12:G:CGCGCG 0.0 12 CGCGCG G +1 1:13:G:CGCGCG 0.0 13 CGCGCG G +1 1:14:T:GTGG 0.0 14 GTGG T +1 1:15:ACT:G 0.0 15 G ACT 1 1:16:A:C 0.0 16 C A -1 1:17:A:C 0.0 17 C A -1 1:18:A:C 0.0 18 C A +1 1:17:ACT:G 0.0 17 G ACT +1 1:18:T:GTGG 0.0 18 GTGG T 1 1:19:A:C 0.0 19 C A 1 1:20:A:C 0.0 20 C A -1 1:21:A:C 0.0 21 C A -1 1:22:A:C 0.0 22 C A -1 1:23:A:C 0.0 23 C A +1 1:21:T:GTGG 0.0 21 GTGG T +1 1:22:G:CGCGCG 0.0 22 CGCGCG G +1 1:23:T:GTGG 0.0 23 GTGG T 1 1:24:A:C 0.0 24 C A 1 1:25:A:C 0.0 25 C A -1 1:26:A:C 0.0 26 C A -1 1:27:A:C 0.0 27 C A -1 1:28:A:C 0.0 28 C A -1 1:29:A:C 0.0 29 C A +1 1:26:ACT:G 0.0 26 G ACT +1 1:27:G:CGCGCG 0.0 27 CGCGCG G +1 1:28:ACT:G 0.0 28 G ACT +1 1:29:T:GTGG 0.0 29 GTGG T 1 1:30:A:C 0.0 30 C A -1 1:31:A:C 0.0 31 C A -1 1:32:A:C 0.0 32 C A -1 1:33:A:C 0.0 33 C A -1 1:34:A:C 0.0 34 C A +1 1:31:T:GTGG 0.0 31 GTGG T +1 1:32:G:CGCGCG 0.0 32 CGCGCG G +1 1:33:ACT:G 0.0 33 G ACT +1 1:34:G:CGCGCG 0.0 34 CGCGCG G 1 1:35:A:C 0.0 35 C A -1 1:36:A:C 0.0 36 C A -1 1:37:A:C 0.0 37 C A +1 1:36:G:CGCGCG 0.0 36 CGCGCG G +1 1:37:T:GTGG 0.0 37 GTGG T 1 1:38:A:C 0.0 38 C A 1 1:39:A:C 0.0 39 C A -1 1:40:A:C 0.0 40 C A +1 1:40:T:GTGG 0.0 40 GTGG T 1 1:41:A:C 0.0 41 C A -1 1:42:A:C 0.0 42 C A -1 1:43:A:C 0.0 43 C A -1 1:44:A:C 0.0 44 C A -1 1:45:A:C 0.0 45 C A -1 1:46:A:C 0.0 46 C A -1 1:47:A:C 0.0 47 C A +1 1:42:G:CGCGCG 0.0 42 CGCGCG G +1 1:43:T:GTGG 0.0 43 GTGG T +1 1:44:ACT:G 0.0 44 G ACT +1 1:45:G:CGCGCG 0.0 45 CGCGCG G +1 1:46:ACT:G 0.0 46 G ACT +1 1:47:G:CGCGCG 0.0 47 CGCGCG G 1 1:48:A:C 0.0 48 C A 1 1:49:A:C 0.0 49 C A 1 1:50:A:C 0.0 50 C A -1 1:51:A:C 0.0 51 C A +1 1:51:G:CGCGCG 0.0 51 CGCGCG G 1 1:52:A:C 0.0 52 C A -1 1:53:A:C 0.0 53 C A +1 1:53:ACT:G 0.0 53 G ACT 1 1:54:A:C 0.0 54 C A -1 1:55:A:C 0.0 55 C A -1 1:56:A:C 0.0 56 C A -1 1:57:A:C 0.0 57 C A +1 1:55:G:CGCGCG 0.0 55 CGCGCG G +1 1:56:T:GTGG 0.0 56 GTGG T +1 1:57:G:CGCGCG 0.0 57 CGCGCG G 1 1:58:A:C 0.0 58 C A -1 1:59:A:C 0.0 59 C A -1 1:60:A:C 0.0 60 C A -1 1:61:A:C 0.0 61 C A +1 1:59:T:GTGG 0.0 59 GTGG T +1 1:60:G:CGCGCG 0.0 60 CGCGCG G +1 1:61:ACT:G 0.0 61 G ACT 1 1:62:A:C 0.0 62 C A -1 1:63:A:C 0.0 63 C A -1 1:64:A:C 0.0 64 C A -1 1:65:A:C 0.0 65 C A -1 1:66:A:C 0.0 66 C A -1 1:67:A:C 0.0 67 C A -1 1:68:A:C 0.0 68 C A -1 1:69:A:C 0.0 69 C A -1 1:70:A:C 0.0 70 C A -1 1:71:A:C 0.0 71 C A -1 1:72:A:C 0.0 72 C A +1 1:63:G:CGCGCG 0.0 63 CGCGCG G +1 1:64:T:GTGG 0.0 64 GTGG T +1 1:65:T:GTGG 0.0 65 GTGG T +1 1:66:ACT:G 0.0 66 G ACT +1 1:67:T:GTGG 0.0 67 GTGG T +1 1:68:ACT:G 0.0 68 G ACT +1 1:69:G:CGCGCG 0.0 69 CGCGCG G +1 1:70:G:CGCGCG 0.0 70 CGCGCG G +1 1:71:ACT:G 0.0 71 G ACT +1 1:72:G:CGCGCG 0.0 72 CGCGCG G 1 1:73:A:C 0.0 73 C A 1 1:74:A:C 0.0 74 C A -1 1:75:A:C 0.0 75 C A +1 1:75:T:GTGG 0.0 75 GTGG T 1 1:76:A:C 0.0 76 C A -1 1:77:A:C 0.0 77 C A -1 1:78:A:C 0.0 78 C A +1 1:77:ACT:G 0.0 77 G ACT +1 1:78:ACT:G 0.0 78 G ACT 1 1:79:A:C 0.0 79 C A 1 1:80:A:C 0.0 80 C A 1 1:81:A:C 0.0 81 C A -1 1:82:A:C 0.0 82 C A +1 1:82:T:GTGG 0.0 82 GTGG T 1 1:83:A:C 0.0 83 C A -1 1:84:A:C 0.0 84 C A +1 1:84:ACT:G 0.0 84 G ACT 1 1:85:A:C 0.0 85 C A -1 1:86:A:C 0.0 86 C A -1 1:87:A:C 0.0 87 C A +1 1:86:G:CGCGCG 0.0 86 CGCGCG G +1 1:87:ACT:G 0.0 87 G ACT 1 1:88:A:C 0.0 88 C A 1 1:89:A:C 0.0 89 C A -1 1:90:A:C 0.0 90 C A -1 1:91:A:C 0.0 91 C A -1 1:92:A:C 0.0 92 C A +1 1:90:T:GTGG 0.0 90 GTGG T +1 1:91:T:GTGG 0.0 91 GTGG T +1 1:92:T:GTGG 0.0 92 GTGG T 1 1:93:A:C 0.0 93 C A 1 1:94:A:C 0.0 94 C A 1 1:95:A:C 0.0 95 C A 1 1:96:A:C 0.0 96 C A -1 1:97:A:C 0.0 97 C A -1 1:98:A:C 0.0 98 C A -1 1:99:A:C 0.0 99 C A +1 1:97:T:GTGG 0.0 97 GTGG T +1 1:98:ACT:G 0.0 98 G ACT +1 1:99:T:GTGG 0.0 99 GTGG T 1 1:100:A:C 0.0 100 C A diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam index e0af7f4..36efeff 100644 --- a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam +++ b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam @@ -1,10 +1,10 @@ -0 0 0 0 0 NA -0 1 0 0 0 NA -0 2 0 0 0 NA -0 3 0 0 0 NA -0 4 0 0 0 NA -0 5 0 0 0 NA -0 6 0 0 0 NA -0 7 0 0 0 NA -0 8 0 0 0 NA -0 9 0 0 0 NA +0 000 0 0 0 NA +0 001 0 0 0 NA +0 002 0 0 0 NA +0 003 0 0 0 NA +0 004 0 0 0 NA +0 005 0 0 0 NA +0 006 0 0 0 NA +0 007 0 0 0 NA +0 008 0 0 0 NA +0 009 0 0 0 NA diff --git a/sgkit_plink/tests/test_pysnptools.py b/sgkit_plink/tests/test_pysnptools.py index ac5df4d..5d38578 100644 --- a/sgkit_plink/tests/test_pysnptools.py +++ b/sgkit_plink/tests/test_pysnptools.py @@ -32,6 +32,17 @@ def test_raise_on_both_path_types(): read_plink(path="x", bed_path="x") +def test_fixlen_str_variable(ds1): + assert ds1["sample_id"].dtype == np.dtype("