Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow users to access the distances data as a numpy array #135

Merged
Merged 1 commit into the base branch on Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- os: macos-13
open-mp: "OFF"
neon: "OFF"
- os: macos-14
- os: macos-latest
open-mp: "OFF"
neon: "ON"
- os: windows-latest
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ jobs:

strategy:
matrix:
os: [ubuntu-latest, macos-13, macos-14, windows-latest]
os: [ubuntu-latest, macos-13, macos-latest, windows-latest]

steps:
- uses: actions/checkout@v4
with:
submodules: "recursive"
- uses: pypa/cibuildwheel@v2.19
- uses: pypa/cibuildwheel@v2.20
env:
CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_11.8
- uses: actions/upload-artifact@v4
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
- id: mixed-line-ending

- repo: https://github.com/psf/black
rev: 24.4.2
rev: 24.8.0
hooks:
- id: black

Expand All @@ -29,7 +29,7 @@ repos:
- id: prettier

- repo: https://github.com/python-jsonschema/check-jsonschema
rev: 0.28.6
rev: 0.29.1
hooks:
- id: check-github-workflows
- id: check-readthedocs
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
cmake_minimum_required(VERSION 3.23..3.29)
cmake_minimum_required(VERSION 3.23..3.30)

project(
hammingdist
VERSION 1.2.0
VERSION 1.3.0
LANGUAGES CXX)

include(CTest)
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ data.dump_sparse("sparse.txt", threshold=3)
# If the `remove_duplicates` option was used, the sequence indices can also be written.
# For each input sequence, this prints the corresponding index in the output:
data.dump_sequence_indices("indices.txt")

# The lower-triangular distance elements can also be directly accessed as a 1-d numpy array:
lt_array = data.lt_array
# The elements in this array correspond to the 2-d indices (row=1,col=0), (row=2,col=0), (row=2,col=1), ...
# These indices can be generated using the numpy tril_indices function, e.g. to construct the lower-triangular matrix:
lt_matrix = np.zeros((n_seq, n_seq))
lt_matrix[np.tril_indices(n_seq, -1)] = lt_array
```

## Duplicates
Expand Down
2 changes: 1 addition & 1 deletion ext/Catch2
Submodule Catch2 updated 133 files
2 changes: 1 addition & 1 deletion ext/benchmark
Submodule benchmark updated 108 files
2 changes: 1 addition & 1 deletion ext/pybind11
Submodule pybind11 updated 50 files
+2 −0 .clang-tidy
+8 −0 .github/workflows/ci.yml
+30 −0 .github/workflows/emscripten.yaml
+1 −1 .github/workflows/format.yml
+1 −1 .github/workflows/pip.yml
+6 −6 .pre-commit-config.yaml
+1 −0 CMakeLists.txt
+1 −1 docs/advanced/cast/eigen.rst
+92 −0 docs/changelog.rst
+3 −3 docs/compiling.rst
+0 −4 docs/limitations.rst
+3 −3 docs/requirements.txt
+9 −2 include/pybind11/cast.h
+19 −3 include/pybind11/detail/common.h
+4 −2 include/pybind11/detail/init.h
+22 −7 include/pybind11/detail/internals.h
+2 −62 include/pybind11/detail/type_caster_base.h
+77 −0 include/pybind11/detail/value_and_holder.h
+0 −3 include/pybind11/eigen/tensor.h
+45 −34 include/pybind11/functional.h
+10 −1 include/pybind11/gil_safe_call_once.h
+6 −2 include/pybind11/numpy.h
+11 −2 include/pybind11/stl/filesystem.h
+1 −1 include/pybind11/stl_bind.h
+7 −2 include/pybind11/typing.h
+26 −3 pybind11/__main__.py
+1 −1 pybind11/_version.py
+10 −1 tests/CMakeLists.txt
+1 −1 tests/constructor_stats.h
+1 −0 tests/extra_python_package/test_files.py
+21 −0 tests/pyproject.toml
+5 −0 tests/test_async.py
+2 −0 tests/test_builtin_casters.py
+3 −0 tests/test_callbacks.py
+11 −5 tests/test_eigen_tensor.inl
+1 −1 tests/test_exceptions.py
+9 −4 tests/test_gil_scoped.py
+4 −0 tests/test_iostream.py
+6 −6 tests/test_modules.cpp
+2 −0 tests/test_numpy_dtypes.cpp
+1 −1 tests/test_opaque_types.cpp
+4 −4 tests/test_pytypes.cpp
+2 −2 tests/test_pytypes.py
+1 −0 tests/test_tagbased_polymorphic.cpp
+5 −0 tests/test_thread.py
+46 −0 tests/test_type_caster_std_function_specializations.cpp
+15 −0 tests/test_type_caster_std_function_specializations.py
+3 −0 tests/test_virtual_functions.py
+29 −3 tools/pybind11Common.cmake
+1 −1 tools/pybind11Config.cmake.in
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "hammingdist"
version = "1.2.0"
version = "1.3.0"
description = "A fast tool to calculate Hamming distances"
readme = "README.md"
license = {text = "MIT"}
Expand All @@ -23,6 +23,7 @@ classifiers=[
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Operating System :: MacOS :: MacOS X",
Expand All @@ -39,7 +40,7 @@ test = ["pytest", "numpy"]

[tool.scikit-build]
cmake.version = ">=3.23"
cmake.verbose = true
build.verbose = true

[tool.scikit-build.cmake.define]
BUILD_TESTING = "OFF"
Expand All @@ -48,7 +49,7 @@ HAMMING_BUILD_PYTHON = "ON"

[tool.cibuildwheel]
skip = "*-manylinux_i686 *-musllinux*"
test-skip = "pp*"
test-skip = "pp* cp313-manylinux_x86_64"
test-extras = "test"
test-command = "pytest {project}/python/tests -v"
environment = { BLAS="None", LAPACK="None", ATLAS="None" }
Expand Down
10 changes: 8 additions & 2 deletions python/hammingdist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ PYBIND11_MODULE(hammingdist, m) {
&DataSet<DefaultDistIntType>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
.def("__getitem__", &DataSet<DefaultDistIntType>::operator[])
.def_readonly("_distances", &DataSet<DefaultDistIntType>::result);
.def_readonly("_distances", &DataSet<DefaultDistIntType>::result)
.def_property_readonly("lt_array", [](DataSet<DefaultDistIntType> &self) {
return py::array(self.result.size(), self.result.data());
});

py::class_<DataSet<uint16_t>>(m, "DataSetLarge")
.def("dump", &DataSet<uint16_t>::dump,
Expand All @@ -58,7 +61,10 @@ PYBIND11_MODULE(hammingdist, m) {
.def("dump_sequence_indices", &DataSet<uint16_t>::dump_sequence_indices,
"Dump row index in distances matrix for each input sequence")
.def("__getitem__", &DataSet<uint16_t>::operator[])
.def_readonly("_distances", &DataSet<uint16_t>::result);
.def_readonly("_distances", &DataSet<uint16_t>::result)
.def_property_readonly("lt_array", [](DataSet<uint16_t> &self) {
return py::array(self.result.size(), self.result.data());
});

m.def("from_stringlist", &from_stringlist,
"Creates a dataset from a list of strings");
Expand Down
14 changes: 12 additions & 2 deletions python/tests/test_hammingdist.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def write_fasta_file(filename, sequences):


def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=None):
assert dat.lt_array.shape == (n_out * (n_out - 1) // 2,)

dat.dump(tmp_out_file)
dump = np.loadtxt(tmp_out_file, delimiter=",")
assert len(dump) == n_out
Expand Down Expand Up @@ -97,8 +99,9 @@ def test_from_fasta(from_fasta_func, use_gpu, tmp_path):
)
@pytest.mark.parametrize("max_distance", [0, 1, 2, 3, 89, 497, 9999999])
def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
# generate 50 sequences, each with 25 characters
sequences = ["".join(random.choices(chars, k=25)) for i in range(50)]
n_seq = 50
n_chars = 25
sequences = ["".join(random.choices(chars, k=n_chars)) for i in range(n_seq)]
fasta_file = str(tmp_path / "fasta.txt")
write_fasta_file(fasta_file, sequences)
# calculate distances matrix
Expand All @@ -108,6 +111,12 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
include_x=include_x,
max_distance=max_distance,
)
# get lower-triangular data as 1-d array
lt_array = data.lt_array
assert lt_array.shape == (n_seq * (n_seq - 1) // 2,)
# reshape to lower-triangular matrix
lt_matrix = np.zeros((n_seq, n_seq), dtype=np.uint8)
lt_matrix[np.tril_indices(n_seq, -1)] = lt_array
# use each sequence in turn as the reference sequence & calculate reference distances
for i, sequence in enumerate(sequences):
vec = hammingdist.fasta_reference_distances(
Expand All @@ -120,6 +129,7 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
# if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction
if include_x or i != j:
assert data[i, j] == min(max_distance, dist)
assert lt_matrix[max(i, j), min(i, j)] == min(max_distance, dist)
# should also agree with output of distance function for these two sequences
assert dist == hammingdist.distance(
sequences[i], sequences[j], include_x=include_x
Expand Down
Loading