Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adds script to import cctbx data to rs #264

Merged
merged 39 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
a2a2343
adds script to import cctbx data to rs
dermen Aug 10, 2024
36b5873
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 10, 2024
592007b
removes cctbx dependency
dermen Aug 11, 2024
8db20f9
merged precommit
dermen Aug 11, 2024
cd94855
merged precommit 2
dermen Aug 11, 2024
1233625
rejig to io
dermen Aug 20, 2024
fa209fe
adds mpi support
dermen Aug 20, 2024
f19bae4
addresses review
dermen Aug 25, 2024
0106d9d
removes comment
dermen Aug 25, 2024
45b8ddd
addresses review pt2
dermen Aug 26, 2024
c52719f
uses better names
dermen Aug 26, 2024
d1c4424
cleans up io __init__
dermen Aug 27, 2024
c2d84fd
adds support for more columns
dermen Aug 27, 2024
b2afcb5
adds verbose flag for read_dials_stills
dermen Aug 28, 2024
f315274
more debug statements
dermen Aug 28, 2024
45707ca
Merge remote-tracking branch 'upstream/main'
dermen Sep 3, 2024
5f850e5
unit tests for refl table reader
dermen Sep 3, 2024
b28bd46
adds back in the comma
dermen Sep 3, 2024
c71dd06
cleanup
dermen Sep 3, 2024
d30d18f
more cleanup
dermen Sep 4, 2024
a1e07bb
use tempfile, remove __main__ for production
Sep 10, 2024
37ce079
refactor, add mpi test with dummy comms
Sep 12, 2024
36c3376
Merge pull request #1 from kmdalton/stills
dermen Sep 17, 2024
53f6021
get ray_context
dermen Sep 17, 2024
493630c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 17, 2024
cb7570b
Update common.py
dermen Sep 17, 2024
479104d
Update common.py
dermen Sep 17, 2024
7f7dd26
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 17, 2024
7fd1208
allow nan inference for float64
Sep 18, 2024
a365d34
remove dtype inference from read_dials_stills
Sep 18, 2024
cc1117b
make cell/sg optional. improve docstring
Sep 18, 2024
73e1ed4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 18, 2024
8e32b56
fix docstring
Sep 19, 2024
af61760
make dtype inference optional
Sep 19, 2024
bce6f80
test dtype inference toggle
Sep 19, 2024
58b862d
test mtz_dtypes flag and mtz writing
Sep 19, 2024
1fc6f7a
no need for a list of files
Sep 19, 2024
6c248e4
separate test for mtz io
Sep 19, 2024
1d0b88d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions reciprocalspaceship/io/dials.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import glob

import gemmi
import msgpack
import numpy as np
import ray

import reciprocalspaceship as rs

MSGPACK_DTYPES = {
"double": np.float64,
"float": np.float32,
"int": np.int32,
"cctbx::miller::index<>": np.int32,
"vec3<double>": np.float64,
"std::size_t": np.intp,
}


def get_msgpack_data(data, name):
"""

Parameters
----------
data: msgpack data dict
name: msgpack data key

Returns
-------
numpy array of values
"""
dtype, (num, buff) = data[name]
if dtype in MSGPACK_DTYPES:
dtype = MSGPACK_DTYPES[dtype]
else:
dtype = None
vals = np.frombuffer(buff, dtype).reshape((num, -1))
return vals.T


def get_fnames(dirnames):
fnames = []
for dirname in dirnames:
fnames += glob.glob(dirname + "/*integrated.refl")
print("Found %d files" % len(fnames))
return fnames


def _concat(refl_data):
refl_data = [ds for ds in refl_data if ds is not None]
"""combine output of _get_refl_data"""
print("Combining tables!")
ds = rs.concat(refl_data)
expt_ids = set(ds.BATCH)
print(f"Found {len(ds)} refls from {len(expt_ids)} expts.")
print("Mapping batch column.")
expt_id_map = {name: i for i, name in enumerate(expt_ids)}
ds.BATCH = [expt_id_map[eid] for eid in ds.BATCH]
ds = ds.infer_mtz_dtypes().set_index(["H", "K", "L"], drop=True)
return ds


def _get_refl_data(fnames, ucell, symbol, rank=0, size=1):
"""

Parameters
----------
fnames: integrated refl fioles
ucell: unit cell tuple (6 params Ang,Ang,Ang,deg,deg,deg)
symbol: space group name e.g. P4
rank: process Id [0-N) where N is num proc
size: total number of proc (N)

Returns
-------
RS dataset (pandas Dataframe)

"""

sg_num = gemmi.find_spacegroup_by_name(symbol).number
all_ds = []

for i_f, f in enumerate(fnames):
if i_f % size != rank:
continue
if rank == 0:
print(f"Loading {i_f+1}/{len(fnames)}")
_, _, R = msgpack.load(open(f, "rb"), strict_map_key=False)
refl_data = R["data"]
expt_id_map = R["identifiers"]
h, k, l = get_msgpack_data(refl_data, "miller_index")
(I,) = get_msgpack_data(refl_data, "intensity.sum.value")
(sigI,) = get_msgpack_data(refl_data, "intensity.sum.variance")
x, y, _ = get_msgpack_data(refl_data, "xyzcal.px")
sx, sy, sz = get_msgpack_data(refl_data, "s1")
(dpsi,) = get_msgpack_data(refl_data, "delpsical.rad")
(local_id,) = get_msgpack_data(refl_data, "id")
global_id = np.array([expt_id_map[li] for li in local_id])
ds = rs.DataSet(
{
"H": h,
"K": k,
"L": l,
"BATCH": global_id,
"DPSI": dpsi,
"I": I,
"SigI": sigI,
"X": x,
"Y": y,
},
cell=ucell,
spacegroup=sg_num,
)
ds["SX"] = sx
ds["SY"] = sy
ds["SZ"] = sz
ds["PARTIAL"] = True
all_ds.append(ds)
if all_ds:
all_ds = rs.concat(all_ds)
else:
all_ds = None
return all_ds


def read_dials_stills(dirnames, ucell, symbol, nj=10):
"""

Parameters
----------
dirnames: folders containing stills process results (integrated.refl)
ucell: unit cell tuple (6 params Ang,Ang,Ang,deg,deg,deg)
symbol: space group name e.g. P4
nj: number of jobs

Returns
-------
RS dataset (pandas Dataframe)
"""
fnames = get_fnames(dirnames)
ray.init(num_cpus=nj)

# get the refl data
get_refl_data = ray.remote(_get_refl_data)
refl_data = ray.get(
[get_refl_data.remote(fnames, ucell, symbol, rank, nj) for rank in range(nj)]
)

ds = _concat(refl_data)
return ds
31 changes: 31 additions & 0 deletions reciprocalspaceship/io/dials_mpi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from mpi4py import MPI

COMM = MPI.COMM_WORLD
from reciprocalspaceship.io import dials


def read_dials_stills_mpi(dirnames, ucell, symbol):
"""

Parameters
----------
dirnames: folders containing stills process results (integrated.refl)
ucell: unit cell tuple (6 params Ang,Ang,Ang,deg,deg,deg)
symbol: space group name e.g. P4

Returns
-------
RS dataset (pandas Dataframe) if MPI rank==0 else None
"""
fnames = None
if COMM.rank == 0:
fnames = dials.get_fnames(dirnames)
fnames = COMM.bcast(fnames)

refl_data = dials._get_refl_data(fnames, ucell, symbol, COMM.rank, COMM.size)
refl_data = COMM.gather(refl_data)
ds = None
if COMM.rank == 0:
ds = dials._concat(refl_data)

return ds
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def getVersionNumber():
},
entry_points={
"console_scripts": [
"rs.mtzdump=reciprocalspaceship.commandline.mtzdump:main",
"rs.mtzdump=reciprocalspaceship.commandline.mtzdump:main"
]
},
classifiers=[
Expand Down