Skip to content

Commit

Permalink
Precompute string lengths sgkit-dev#12
Browse files Browse the repository at this point in the history
  • Loading branch information
eric-czech committed Aug 4, 2020
1 parent 5ef6aac commit 91e8912
Showing 1 changed file with 22 additions and 6 deletions.
28 changes: 22 additions & 6 deletions sgkit_plink/pysnptools.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""PLINK 1.9 reader implementation"""
from pathlib import Path
from typing import Optional, Union
from typing import Optional, Union, Mapping, Any

import dask.array as da
import dask.dataframe as dd
import numpy as np
from dask.dataframe import DataFrame
from dask.array import Array
from pysnptools.snpreader import Bed
from xarray import Dataset

Expand Down Expand Up @@ -92,11 +93,26 @@ def close(self):
self.bed._close_bed() # pragma: no cover


def _to_dict(df, dtype=None):
    """Convert every column of ``df`` into a dask array keyed by column name.

    Parameters
    ----------
    df
        Frame whose columns are converted via ``to_dask_array(lengths=True)``.
    dtype
        Optional mapping from column name to target dtype. When it is falsy
        (``None`` or empty), each column keeps its own dtype.

    Returns
    -------
    dict
        ``{column name: array cast to the selected dtype}``.
    """
    columns = {}
    for name in df:
        values = df[name].to_dask_array(lengths=True)
        # An explicit per-column override wins over the column's own dtype.
        target = dtype[name] if dtype else df[name].dtype
        columns[name] = values.astype(target)
    return columns
def _max_str_len(arr: Array) -> Array:
    """Return the length of the longest string in a dask array.

    Each block is converted to ``str`` and measured with
    ``np.char.str_len``; the per-element lengths are then reduced with
    ``max()``.

    Parameters
    ----------
    arr
        Dask array whose elements can be converted with ``astype(str)``.

    Returns
    -------
    Array
        Lazy scalar array holding the maximum length; call ``.compute()``
        to obtain the value.
    """

    def _block_lengths(block):
        # Cast explicitly so the block output matches the declared dtype.
        return np.char.str_len(block.astype(str)).astype(np.int64)

    # Lengths are declared as 64-bit integers: an 8-bit integer cannot
    # represent any length above 127.
    return arr.map_blocks(_block_lengths, dtype=np.int64).max()


def _to_dict(df: DataFrame, dtype: Mapping[str, Any]=None):
    """Convert every column of ``df`` into a dask array keyed by column name.

    Columns whose target dtype is a unicode (``U``) or bytes (``S``) kind are
    cast to a fixed-length string dtype sized to the longest value in that
    column. Measuring that length calls ``.compute()`` once per such column.

    Parameters
    ----------
    df
        Frame whose columns are converted via ``to_dask_array(lengths=True)``.
    dtype
        Optional mapping from column name to target dtype. When it is falsy
        (``None`` or empty), each column's own dtype is used.

    Returns
    -------
    dict
        ``{column name: array cast to the selected dtype}``.
    """
    converted = {}
    for name in df:
        values = df[name].to_dask_array(lengths=True)
        target = dtype[name] if dtype else df[name].dtype
        dtype_kind = np.dtype(target).kind
        if dtype_kind == 'U' or dtype_kind == 'S':
            # Replace the string dtype with one carrying an explicit
            # width, taken from the longest value in the column.
            width = _max_str_len(values).compute()
            target = f"{dtype_kind}{width}"
        converted[name] = values.astype(target)
    return converted


def read_fam(path: PathType, sep: str = " ") -> DataFrame:
Expand Down

0 comments on commit 91e8912

Please sign in to comment.