From e1d7820f5ed3af8c4bae699595eed31ce55a07de Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jul 2024 14:55:30 +0100 Subject: [PATCH 01/11] Some testing --- tests/test_cpython_interface.py | 183 ++++++++++++++++++++++++++++++++ vcztools/_vcztoolsmodule.c | 79 ++++++-------- vcztools/vcf_writer.py | 2 +- 3 files changed, 216 insertions(+), 48 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index f406c5a..4f877c1 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -1,3 +1,5 @@ +import inspect + import pytest import numpy as np @@ -27,6 +29,55 @@ def example_fixed_data(num_variants, num_samples=0): } +def example_encoder(num_variants=1, num_samples=0): + encoder = _vcztools.VcfEncoder( + num_variants, num_samples, **example_fixed_data(num_variants, num_samples) + ) + + return encoder + + +class TestPrintState: + def test_nomimal_case(self, tmp_path): + encoder = example_encoder() + filename = tmp_path / "debug.txt" + with open(filename, "w") as f: + encoder.print_state(f) + with open(filename, "r") as f: + s = f.read() + assert "CHROM" in s + + def test_read_only_file(self, tmp_path): + encoder = example_encoder() + filename = tmp_path / "debug.txt" + with open(filename, "w") as f: + f.write("x") + with open(filename, "r") as f: + with pytest.raises(OSError, match="22"): + encoder.print_state(f) + + @pytest.mark.parametrize("fileobj", [None, "path"]) + def test_bad_file_arg(self, fileobj): + encoder = example_encoder() + with pytest.raises(TypeError): + encoder.print_state(fileobj) + + @pytest.mark.parametrize("fileobj", [1024, 123]) + def test_bad_fileno(self, fileobj): + encoder = example_encoder() + with pytest.raises(OSError, match="9"): + encoder.print_state(fileobj) + + +class TestEncodeOverrun: + def test_fixed_fields(self): + encoder = example_encoder() + s = encoder.encode(0, 1024) + for length in range(len(s)): + with pytest.raises(ValueError, match="-101"): + encoder.encode(0, length) + + class TestTypeChecking: @pytest.mark.parametrize("name", FIXED_FIELD_NAMES) def test_field_incorrect_length(self, name): @@ -51,3 +102,135 @@ def test_field_incorrect_type(self, name): data[name] = "A Python string" with pytest.raises(TypeError, match=f"must be numpy.ndarray"): _vcztools.VcfEncoder(num_variants, 0, **data) + + +class TestAddFields: + def test_add_info_field_bad_num_args(self): + encoder = example_encoder() + with pytest.raises(TypeError): + encoder.add_info_field() + with pytest.raises(TypeError): + encoder.add_info_field("name") + + def test_add_format_field_bad_num_args(self): + encoder = example_encoder() + with pytest.raises(TypeError): + encoder.add_format_field() + with pytest.raises(TypeError): + encoder.add_format_field("name") + + def test_add_gt_field_bad_num_args(self): + encoder = example_encoder() + with pytest.raises(TypeError): + encoder.add_gt_field() + with pytest.raises(TypeError): + encoder.add_gt_field(np.zeros((1), dtype=bool)) + + @pytest.mark.parametrize("name", [None, 1234, {}]) + def test_add_info_field_bad_name_type(self, name): + encoder = example_encoder() + with pytest.raises(TypeError, match="must be str"): + encoder.add_info_field(name, np.array([1])) + + @pytest.mark.parametrize("name", [None, 1234, {}]) + def test_add_format_field_bad_name_type(self, name): + encoder = example_encoder() + with pytest.raises(TypeError, match="must be str"): + encoder.add_format_field(name, np.array([1])) + + @pytest.mark.parametrize("array", [None, 1234, {}]) + def test_add_info_field_bad_array_type(self, array): + encoder = example_encoder() + with pytest.raises(TypeError, match="ndarray"): + encoder.add_info_field("name", array) + + @pytest.mark.parametrize("array", [None, 1234, {}]) + def test_add_format_field_bad_array_type(self, array): + encoder = example_encoder() + with pytest.raises(TypeError, match="ndarray"): + encoder.add_format_field("name", array) + + @pytest.mark.parametrize("array", [None, 1234, {}]) + def test_add_gt_phased_field_bad_array_type(self, array): + encoder = example_encoder(1, 1) + with pytest.raises(TypeError, match="ndarray"): + encoder.add_gt_field(np.zeros((1, 1, 1), dtype=np.int8), array) + + @pytest.mark.parametrize("array", [None, 1234, {}]) + def test_add_gt_field_bad_array_type(self, array): + encoder = example_encoder(1, 1) + with pytest.raises(TypeError, match="ndarray"): + encoder.add_gt_field(array, np.zeros((1, 1), dtype=bool)) + + def test_add_info_field_bad_array_num_variants(self): + num_variants = 10 + encoder = example_encoder(num_variants) + with pytest.raises(ValueError, match="number of variants"): + encoder.add_info_field("name", np.zeros((3, 1), dtype=bool)) + + def test_add_format_field_bad_array_num_variants(self): + num_variants = 10 + encoder = example_encoder(num_variants) + with pytest.raises(ValueError, match="number of variants"): + encoder.add_format_field("name", np.zeros((3, 1, 1), dtype=bool)) + + def test_add_format_field_bad_array_num_samples(self): + encoder = example_encoder(5, 2) + with pytest.raises(ValueError, match="number of samples"): + encoder.add_format_field("name", np.zeros((5, 1, 1), dtype=bool)) + + @pytest.mark.parametrize("shape", [(), (5,), (5, 1, 1)]) + def test_add_info_field_wrong_dimension(self, shape): + encoder = example_encoder(5) + with pytest.raises(ValueError, match="wrong dimension"): + encoder.add_info_field("name", np.zeros(shape, dtype=bool)) + + @pytest.mark.parametrize("shape", [(), (5,), (5, 1), (5, 1, 1, 1)]) + def test_add_format_field_wrong_dimension(self, shape): + encoder = example_encoder(5) + with pytest.raises(ValueError, match="wrong dimension"): + encoder.add_format_field("name", np.zeros(shape, dtype=bool)) + + @pytest.mark.parametrize("dtype", ["m", "c16", "O", "u1"]) + def test_add_info_field_unsupported_dtype(self, dtype): + encoder = example_encoder(1) + with pytest.raises(ValueError, match="unsupported array dtype"): + encoder.add_info_field("name", np.zeros((1, 1), dtype=dtype)) + + @pytest.mark.parametrize("dtype", ["m", "c16", "O", "u1"]) + def test_add_format_field_unsupported_dtype(self, dtype): + encoder = example_encoder(1, 1) + with pytest.raises(ValueError, match="unsupported array dtype"): + encoder.add_format_field("name", np.zeros((1, 1, 1), dtype=dtype)) + + @pytest.mark.parametrize("dtype", ["i8", "f8"]) + def test_add_info_field_unsupported_width(self, dtype): + encoder = example_encoder(1) + with pytest.raises(ValueError, match="-203"): + encoder.add_info_field("name", np.zeros((1, 1), dtype=dtype)) + + @pytest.mark.parametrize("dtype", ["i8", "f8"]) + def test_add_format_field_unsupported_width(self, dtype): + encoder = example_encoder(1, 1) + with pytest.raises(ValueError, match="-203"): + encoder.add_format_field("name", np.zeros((1, 1, 1), dtype=dtype)) + + +class TestUninitialised: + @pytest.mark.parametrize( + "name", + ["add_info_field", "add_gt_field", "add_format_field", "print_state", "encode"], + ) + def test_methods(self, name): + cls = _vcztools.VcfEncoder + encoder = cls.__new__(cls) + method = getattr(encoder, name) + with pytest.raises(SystemError, match="not initialised"): + method() + + @pytest.mark.parametrize("name", ["arrays"]) + def test_attrs(self, name): + cls = _vcztools.VcfEncoder + encoder = cls.__new__(cls) + with pytest.raises(SystemError, match="not initialised"): + getattr(encoder, name) diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index e9c484d..73aefc3 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -4,6 +4,7 @@ #include #include /* for offsetof() */ +#include #include #include "vcf_encoder.h" @@ -273,7 +274,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) static int VcfEncoder_add_field_array(VcfEncoder *self, const char *name, PyArrayObject *array, - npy_intp dimension, const char *prefix) + npy_intp dimension, const char *prefix, bool is_format_field) { int ret = -1; npy_intp *shape; @@ -297,6 +298,15 @@ VcfEncoder_add_field_array(VcfEncoder *self, const char *name, PyArrayObject *ar "Array must have first dimension equal to number of variants"); goto out; } + if (is_format_field) { + if (shape[1] != (npy_intp) self->vcf_encoder->num_samples) { + PyErr_SetString(PyExc_ValueError, + "Array must have second dimension equal to number of samples"); + goto out; + } + } + + if (VcfEncoder_add_array(self, prefix, name, array) != 0) { goto out; @@ -341,10 +351,13 @@ VcfEncoder_add_info_field(VcfEncoder *self, PyObject *args) const char *name; int err, type; + if (VcfEncoder_check_state(self) != 0) { + goto out; + } if (!PyArg_ParseTuple(args, "sO!", &name, &PyArray_Type, &array)) { goto out; } - if (VcfEncoder_add_field_array(self, name, array, 2, "INFO/") != 0) { + if (VcfEncoder_add_field_array(self, name, array, 2, "INFO/", false) != 0) { goto out; } type = np_type_to_vcz_type(name, array); @@ -353,7 +366,6 @@ VcfEncoder_add_info_field(VcfEncoder *self, PyObject *args) } err = vcz_variant_encoder_add_info_field(self->vcf_encoder, name, type, PyArray_ITEMSIZE(array), PyArray_DIMS(array)[1], PyArray_DATA(array)); - if (err < 0) { handle_library_error(err); goto out; @@ -371,10 +383,13 @@ VcfEncoder_add_format_field(VcfEncoder *self, PyObject *args) const char *name; int err, type; + if (VcfEncoder_check_state(self) != 0) { + goto out; + } if (!PyArg_ParseTuple(args, "sO!", &name, &PyArray_Type, &array)) { goto out; } - if (VcfEncoder_add_field_array(self, name, array, 3, "FORMAT/") != 0) { + if (VcfEncoder_add_field_array(self, name, array, 3, "FORMAT/", true) != 0) { goto out; } type = np_type_to_vcz_type(name, array); @@ -397,10 +412,8 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) { PyArrayObject *gt = NULL; PyArrayObject *gt_phased = NULL; - PyArrayObject *array; PyObject *ret = NULL; int err; - npy_intp *shape; if (VcfEncoder_check_state(self) != 0) { goto out; @@ -408,45 +421,10 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, >, &PyArray_Type, >_phased)) { goto out; } - - array = gt; - assert(PyArray_CheckExact(array)); - if (!PyArray_CHKFLAGS(array, NPY_ARRAY_IN_ARRAY)) { - PyErr_SetString(PyExc_ValueError, "Array must have NPY_ARRAY_IN_ARRAY flags."); - goto out; - } - if (PyArray_NDIM(array) != 3) { - PyErr_SetString(PyExc_ValueError, "Array wrong dimension"); - goto out; - } - shape = PyArray_DIMS(array); - if (shape[0] != (npy_intp) self->vcf_encoder->num_variants) { - PyErr_SetString(PyExc_ValueError, - "Array must have first dimension equal to number of variants"); - goto out; - } - - if (VcfEncoder_add_array(self, "FORMAT/", "GT", array) != 0) { + if (VcfEncoder_add_field_array(self, "GT", gt, 3, "FORMAT/", true) != 0) { goto out; } - - array = gt_phased; - assert(PyArray_CheckExact(array)); - if (!PyArray_CHKFLAGS(array, NPY_ARRAY_IN_ARRAY)) { - PyErr_SetString(PyExc_ValueError, "Array must have NPY_ARRAY_IN_ARRAY flags."); - goto out; - } - if (PyArray_NDIM(array) != 2) { - PyErr_SetString(PyExc_ValueError, "Array wrong dimension"); - goto out; - } - shape = PyArray_DIMS(array); - if (shape[0] != (npy_intp) self->vcf_encoder->num_variants) { - PyErr_SetString(PyExc_ValueError, - "Array must have first dimension equal to number of variants"); - goto out; - } - if (VcfEncoder_add_array(self, "FORMAT/", "GT_phased", array) != 0) { + if (VcfEncoder_add_field_array(self, "GT_phased", gt_phased, 2, "FORMAT/", true) != 0) { goto out; } err = vcz_variant_encoder_add_gt_field(self->vcf_encoder, PyArray_ITEMSIZE(gt), @@ -461,7 +439,7 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) } static PyObject * -VcfEncoder_encode_row(VcfEncoder *self, PyObject *args) +VcfEncoder_encode(VcfEncoder *self, PyObject *args) { PyObject *ret = NULL; int row; @@ -523,7 +501,14 @@ VcfEncoder_print_state(VcfEncoder *self, PyObject *args) static PyObject * VcfEncoder_getarrays(VcfEncoder *self, void *closure) { - return PyDict_Copy(self->arrays); + PyObject *ret = NULL; + + if (VcfEncoder_check_state(self) != 0) { + goto out; + } + ret = PyDict_Copy(self->arrays); +out: + return ret; } static PyGetSetDef VcfEncoder_getsetters[] = { @@ -549,8 +534,8 @@ static PyMethodDef VcfEncoder_methods[] = { .ml_meth = (PyCFunction) VcfEncoder_add_format_field, .ml_flags = METH_VARARGS, .ml_doc = "Add a format field to the encoder" }, - { .ml_name = "encode_row", - .ml_meth = (PyCFunction) VcfEncoder_encode_row, + { .ml_name = "encode", + .ml_meth = (PyCFunction) VcfEncoder_encode, .ml_flags = METH_VARARGS, .ml_doc = "Return the specified row of VCF text" }, { NULL } /* Sentinel */ diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index f6d8187..b383df1 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -260,7 +260,7 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): encoder.add_format_field(name, array) for j in range(num_variants): - line = encoder.encode_row(j, 2**20) + line = encoder.encode(j, 2**20) print(line, file=output) From cb76a281e798e4a8ba91c7af5896188bb0ab8b1b Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 15 Jul 2024 14:56:48 +0100 Subject: [PATCH 02/11] Some lint --- tests/test_cpython_interface.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index 4f877c1..aa17841 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -1,5 +1,3 @@ -import inspect - import pytest import numpy as np From 0a11f8836b0a707f14b910ba92be585dd30492c4 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 11:34:55 +0100 Subject: [PATCH 03/11] Tighten up testing on fixed fields --- tests/test_cpython_interface.py | 33 ++++- vcztools/_vcztoolsmodule.c | 253 +++++++++++++++----------------- 2 files changed, 147 insertions(+), 139 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index aa17841..5a7e855 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -76,31 +76,52 @@ def test_fixed_fields(self): encoder.encode(0, length) -class TestTypeChecking: +class TestFixedFieldInputChecking: @pytest.mark.parametrize("name", FIXED_FIELD_NAMES) def test_field_incorrect_length(self, name): num_variants = 5 data = example_fixed_data(num_variants) data[name] = data[name][1:] - with pytest.raises(ValueError, match=f"Array {name.upper()} must have "): + with pytest.raises(ValueError, match=f"Array {name} must have "): _vcztools.VcfEncoder(num_variants, 0, **data) - @pytest.mark.parametrize("name", FIXED_FIELD_NAMES) + @pytest.mark.parametrize("name", FIXED_FIELD_NAMES + ["filter_ids"]) def test_field_incorrect_dtype(self, name): num_variants = 5 data = example_fixed_data(num_variants) data[name] = np.zeros(data[name].shape, dtype=np.int64) - with pytest.raises(ValueError, match=f"Wrong dtype for {name.upper()}"): + with pytest.raises(ValueError, match=f"Wrong dtype for {name}"): _vcztools.VcfEncoder(num_variants, 0, **data) - @pytest.mark.parametrize("name", FIXED_FIELD_NAMES) + @pytest.mark.parametrize("name", FIXED_FIELD_NAMES + ["filter_ids"]) def test_field_incorrect_type(self, name): num_variants = 5 data = example_fixed_data(num_variants) data[name] = "A Python string" - with pytest.raises(TypeError, match=f"must be numpy.ndarray"): + with pytest.raises(TypeError, match="must be numpy.ndarray"): _vcztools.VcfEncoder(num_variants, 0, **data) + @pytest.mark.parametrize("name", ["id", "alt"]) + def test_zero_columns(self, name): + data = example_fixed_data(1) + arr = np.frombuffer(b"", dtype="S1").reshape(1, 0) + data[name] = arr + with pytest.raises(ValueError, match="-204"): + _vcztools.VcfEncoder(1, 0, **data) + + def test_zero_filter_ids(self): + data = example_fixed_data(1) + data["filter_ids"] = np.array([], dtype="S") + data["filter"] = np.array([], dtype=bool).reshape(1, 0) + with pytest.raises(ValueError, match="-204"): + _vcztools.VcfEncoder(1, 0, **data) + + def test_incorrect_num_filter_ids(self): + data = example_fixed_data(1) + data["filter_ids"] = np.array(["PASS", "1"], dtype="S") + with pytest.raises(ValueError, match="filters dimension must be"): + _vcztools.VcfEncoder(1, 0, **data) + class TestAddFields: def test_add_info_field_bad_num_args(self): diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index 73aefc3..140bab9 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -98,33 +98,104 @@ VcfEncoder_add_array( } static int -VcfEncoder_store_fixed_array(VcfEncoder *self, PyArrayObject *array, const char *name, - int type, int dimension, int num_variants) +check_array(const char *name, PyArrayObject *array, npy_intp dimension) { + int ret = -1; - npy_intp *shape; assert(PyArray_CheckExact(array)); if (!PyArray_CHKFLAGS(array, NPY_ARRAY_IN_ARRAY)) { - PyErr_SetString(PyExc_ValueError, "Array must have NPY_ARRAY_IN_ARRAY flags."); + PyErr_Format( + PyExc_ValueError, "Array %s must have NPY_ARRAY_IN_ARRAY flags.", name); goto out; } if (PyArray_NDIM(array) != dimension) { - PyErr_Format(PyExc_ValueError, "Array %s is of the wrong dimension", name); + PyErr_Format(PyExc_ValueError, "Array %s has wrong dimension: %d != %d", name, + (int) PyArray_NDIM(array), (int) dimension); + goto out; + } + ret = 0; +out: + return ret; +} + +static int +VcfEncoder_add_field_array(VcfEncoder *self, const char *name, PyArrayObject *array, + npy_intp dimension, const char *prefix, bool is_format_field) +{ + int ret = -1; + npy_intp *shape; + + if (check_array(name, array, dimension) != 0) { goto out; } shape = PyArray_DIMS(array); - if (shape[0] != (npy_intp) num_variants) { + if (shape[0] != (npy_intp) self->vcf_encoder->num_variants) { PyErr_Format(PyExc_ValueError, "Array %s must have first dimension equal to number of variants", name); goto out; } + if (is_format_field) { + if (shape[1] != (npy_intp) self->vcf_encoder->num_samples) { + PyErr_Format(PyExc_ValueError, + "Array %s must have second dimension equal to number of samples", name); + goto out; + } + } + if (VcfEncoder_add_array(self, prefix, name, array) != 0) { + goto out; + } + ret = 0; +out: + return ret; +} + +static int +np_type_to_vcz_type(const char *name, PyArrayObject *array) +{ + int ret = -1; + + switch (PyArray_DTYPE(array)->kind) { + case 'i': + ret = VCZ_TYPE_INT; + break; + case 'f': + ret = VCZ_TYPE_FLOAT; + break; + case 'S': + ret = VCZ_TYPE_STRING; + break; + case 'b': + ret = VCZ_TYPE_BOOL; + break; + default: + PyErr_Format( + PyExc_ValueError, "Array '%s' has unsupported array dtype", name); + goto out; + } +out: + return ret; +} + +static int +check_dtype(const char *name, PyArrayObject *array, int type) { if (PyArray_DTYPE(array)->type_num != type) { PyErr_Format(PyExc_ValueError, "Wrong dtype for %s", name); - goto out; + return -1; } + return 0; +} - if (VcfEncoder_add_array(self, "", name, array) != 0) { +static int +VcfEncoder_store_fixed_array( + VcfEncoder *self, PyArrayObject *array, const char *name, int type, int dimension) +{ + int ret = -1; + + if (VcfEncoder_add_field_array(self, name, array, dimension, "", false) != 0) { + goto out; + } + if (check_dtype(name, array, type) != 0) { goto out; } ret = 0; @@ -166,183 +237,98 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) goto out; } - if (VcfEncoder_store_fixed_array(self, chrom, "CHROM", NPY_STRING, 1, num_variants) - != 0) { - goto out; - } - if (VcfEncoder_store_fixed_array(self, pos, "POS", NPY_INT32, 1, num_variants) - != 0) { - goto out; - } - if (VcfEncoder_store_fixed_array(self, id, "ID", NPY_STRING, 2, num_variants) != 0) { - goto out; - } - if (VcfEncoder_store_fixed_array(self, ref, "REF", NPY_STRING, 1, num_variants) - != 0) { - goto out; - } - if (VcfEncoder_store_fixed_array(self, alt, "ALT", NPY_STRING, 2, num_variants) - != 0) { - goto out; - } - if (VcfEncoder_store_fixed_array(self, qual, "QUAL", NPY_FLOAT32, 1, num_variants) - != 0) { + err = vcz_variant_encoder_init(self->vcf_encoder, num_variants, num_samples); + if (err < 0) { + handle_library_error(err); goto out; } - /* NOTE: we could generalise this pattern for CHROM also to save a bit of time - * in building numpy String arrays */ - assert(PyArray_CheckExact(filter_ids)); - if (!PyArray_CHKFLAGS(filter_ids, NPY_ARRAY_IN_ARRAY)) { - PyErr_SetString(PyExc_ValueError, "Array must have NPY_ARRAY_IN_ARRAY flags."); + if (VcfEncoder_store_fixed_array(self, chrom, "chrom", NPY_STRING, 1) != 0) { goto out; } - if (PyArray_DTYPE(filter_ids)->type_num != NPY_STRING) { - PyErr_Format(PyExc_ValueError, "filter_ids is not of the correct type"); + if (VcfEncoder_store_fixed_array(self, pos, "pos", NPY_INT32, 1) != 0) { goto out; } - - if (PyArray_NDIM(filter_ids) != 1) { - PyErr_Format(PyExc_ValueError, "filter_ids must be 1D"); + if (VcfEncoder_store_fixed_array(self, id, "id", NPY_STRING, 2) != 0) { goto out; } - num_filters = PyArray_DIMS(filter_ids)[0]; - if (VcfEncoder_add_array(self, "IDS/", "FILTER", filter_ids) != 0) { + if (VcfEncoder_store_fixed_array(self, ref, "ref", NPY_STRING, 1) != 0) { goto out; } - if (VcfEncoder_store_fixed_array(self, filter, "FILTER", NPY_BOOL, 2, num_variants) - != 0) { + if (VcfEncoder_store_fixed_array(self, alt, "alt", NPY_STRING, 2) != 0) { goto out; } - if (PyArray_DIMS(filter)[1] != num_filters) { - PyErr_Format( - PyExc_ValueError, "filters dimension must be (num_variants, num_filters)"); + if (VcfEncoder_store_fixed_array(self, qual, "qual", NPY_FLOAT32, 1) != 0) { goto out; } - err = vcz_variant_encoder_init(self->vcf_encoder, num_variants, num_samples); - if (err < 0) { - handle_library_error(err); - goto out; - } - err = vcz_variant_encoder_add_chrom_field( + /* These calls cannot fail, so we don't check the output. */ + vcz_variant_encoder_add_pos_field(self->vcf_encoder, PyArray_DATA(pos)); + vcz_variant_encoder_add_qual_field(self->vcf_encoder, PyArray_DATA(qual)); + + /* In practise, these calls can't fail either because ITEMSIZE is always > 0. + * We keep the checks in case this changes for later versions of numpy. + */ + vcz_variant_encoder_add_chrom_field( self->vcf_encoder, PyArray_ITEMSIZE(chrom), PyArray_DATA(chrom)); if (err < 0) { + // TODO ADD NOCOVER handle_library_error(err); goto out; } - err = vcz_variant_encoder_add_pos_field(self->vcf_encoder, PyArray_DATA(pos)); + vcz_variant_encoder_add_ref_field( + self->vcf_encoder, PyArray_ITEMSIZE(ref), PyArray_DATA(ref)); if (err < 0) { + // TODO ADD NOCOVER handle_library_error(err); goto out; } + /* Note: the only provokable error here is the zero-sized second dimension, + * which we could catch easily above */ err = vcz_variant_encoder_add_id_field( self->vcf_encoder, PyArray_ITEMSIZE(id), PyArray_DIMS(id)[1], PyArray_DATA(id)); if (err < 0) { handle_library_error(err); goto out; } - err = vcz_variant_encoder_add_ref_field( - self->vcf_encoder, PyArray_ITEMSIZE(ref), PyArray_DATA(ref)); - if (err < 0) { - handle_library_error(err); - goto out; - } err = vcz_variant_encoder_add_alt_field(self->vcf_encoder, PyArray_ITEMSIZE(alt), PyArray_DIMS(alt)[1], PyArray_DATA(alt)); if (err < 0) { handle_library_error(err); goto out; } - err = vcz_variant_encoder_add_qual_field(self->vcf_encoder, PyArray_DATA(qual)); - if (err < 0) { - handle_library_error(err); - goto out; - } - err = vcz_variant_encoder_add_filter_field(self->vcf_encoder, - PyArray_ITEMSIZE(filter_ids), num_filters, PyArray_DATA(filter_ids), - PyArray_DATA(filter)); - if (err < 0) { - handle_library_error(err); - goto out; - } - - ret = 0; -out: - return ret; -} -static int -VcfEncoder_add_field_array(VcfEncoder *self, const char *name, PyArrayObject *array, - npy_intp dimension, const char *prefix, bool is_format_field) -{ - int ret = -1; - npy_intp *shape; - - if (VcfEncoder_check_state(self) != 0) { + /* NOTE: we could generalise this pattern for CHROM also to save a bit of time + * in building numpy String arrays */ + if (check_array("filter_ids", filter_ids, 1) != 0) { goto out; } - - assert(PyArray_CheckExact(array)); - if (!PyArray_CHKFLAGS(array, NPY_ARRAY_IN_ARRAY)) { - PyErr_SetString(PyExc_ValueError, "Array must have NPY_ARRAY_IN_ARRAY flags."); + if (check_dtype("filter_ids", filter_ids, NPY_STRING) != 0) { goto out; } - if (PyArray_NDIM(array) != dimension) { - PyErr_SetString(PyExc_ValueError, "Array wrong dimension"); + num_filters = PyArray_DIMS(filter_ids)[0]; + if (VcfEncoder_add_array(self, "ids/", "filter", filter_ids) != 0) { goto out; } - shape = PyArray_DIMS(array); - if (shape[0] != (npy_intp) self->vcf_encoder->num_variants) { - PyErr_SetString(PyExc_ValueError, - "Array must have first dimension equal to number of variants"); + if (VcfEncoder_store_fixed_array(self, filter, "filter", NPY_BOOL, 2) != 0) { goto out; } - if (is_format_field) { - if (shape[1] != (npy_intp) self->vcf_encoder->num_samples) { - PyErr_SetString(PyExc_ValueError, - "Array must have second dimension equal to number of samples"); - goto out; - } + if (PyArray_DIMS(filter)[1] != num_filters) { + PyErr_Format( + PyExc_ValueError, "filters dimension must be (num_variants, num_filters)"); + goto out; } - - - - if (VcfEncoder_add_array(self, prefix, name, array) != 0) { + err = vcz_variant_encoder_add_filter_field(self->vcf_encoder, + PyArray_ITEMSIZE(filter_ids), num_filters, PyArray_DATA(filter_ids), + PyArray_DATA(filter)); + if (err < 0) { + handle_library_error(err); goto out; } ret = 0; out: return ret; } - -static int -np_type_to_vcz_type(const char *name, PyArrayObject *array) -{ - int ret = -1; - - switch (PyArray_DTYPE(array)->kind) { - case 'i': - ret = VCZ_TYPE_INT; - break; - case 'f': - ret = VCZ_TYPE_FLOAT; - break; - case 'S': - ret = VCZ_TYPE_STRING; - break; - case 'b': - ret = VCZ_TYPE_BOOL; - break; - default: - PyErr_Format( - PyExc_ValueError, "Array '%s' has unsupported array dtype", name); - goto out; - } -out: - return ret; -} - static PyObject * VcfEncoder_add_info_field(VcfEncoder *self, PyObject *args) { @@ -424,7 +410,8 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) if (VcfEncoder_add_field_array(self, "GT", gt, 3, "FORMAT/", true) != 0) { goto out; } - if (VcfEncoder_add_field_array(self, "GT_phased", gt_phased, 2, "FORMAT/", true) != 0) { + if (VcfEncoder_add_field_array(self, "GT_phased", gt_phased, 2, "FORMAT/", true) + != 0) { goto out; } err = vcz_variant_encoder_add_gt_field(self->vcf_encoder, PyArray_ITEMSIZE(gt), From 75da01fd92fd2a21ea226d6491128d59ac642f2c Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 12:28:58 +0100 Subject: [PATCH 04/11] Some round-trip testing of array storage --- tests/test_cpython_interface.py | 64 +++++++++++++++++++++++++++++++-- vcztools/_vcztoolsmodule.c | 9 ++--- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index 5a7e855..187dc81 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -27,11 +27,38 @@ def example_fixed_data(num_variants, num_samples=0): } -def example_encoder(num_variants=1, num_samples=0): +def example_info_data(num_variants): + d = {} + for num_columns in range(1, 3): + for dtype in ["i1", "i2", "i4", "f4", "?", "S1"]: + name = f"I{dtype}_{num_columns}" + data = np.arange(num_variants * num_columns).astype(dtype) + d[name] = data.reshape((num_variants, num_columns)) + return d + + +def example_format_data(num_variants, num_samples): + d = {} + for num_columns in range(1, 3): + for dtype in ["i1", "i2", "i4", "f4", "?", "S1"]: + name = f"F{dtype}_{num_columns}" + data = np.arange(num_variants * num_samples * num_columns).astype(dtype) + d[name] = data.reshape((num_variants, num_samples, num_columns)) + return d + + +def example_encoder(num_variants=1, num_samples=0, add_info=True): encoder = _vcztools.VcfEncoder( num_variants, num_samples, **example_fixed_data(num_variants, num_samples) ) - + if add_info: + for name, data in example_info_data(num_variants).items(): + encoder.add_info_field(name, data) + if num_samples > 0: + for name, data in example_format_data(num_variants, num_samples).items(): + encoder.add_format_field(name, data) + # import sys + # encoder.print_state(sys.stdout) return encoder @@ -235,6 +262,39 @@ def test_add_format_field_unsupported_width(self, dtype): encoder.add_format_field("name", np.zeros((1, 1, 1), dtype=dtype)) +class TestArrays: + def test_stored_data_equal(self): + num_variants = 20 + num_samples = 10 + fixed_data = example_fixed_data(num_variants, num_samples) + encoder = _vcztools.VcfEncoder(num_variants, num_samples, **fixed_data) + info_data = example_info_data(num_variants) + format_data = example_format_data(num_variants, num_samples) + for name, data in info_data.items(): + encoder.add_info_field(name, data) + for name, data in format_data.items(): + encoder.add_format_field(name, data) + all_data = {**fixed_data} + for name, array in info_data.items(): + all_data[f"INFO/{name}"] = array + for name, array in format_data.items(): + all_data[f"FORMAT/{name}"] = array + encoder_arrays = encoder.arrays + assert set(encoder.arrays.keys()) == set(all_data.keys()) + for name in all_data.keys(): + # Strong identity assertion here for now, but this may change + assert all_data[name] is encoder_arrays[name] + + def test_array_bad_alignment(self): + fixed_data = example_fixed_data(1) + a = fixed_data["pos"] + # Value is not aligned + a = np.frombuffer(b"\0\0\0\0\0", dtype=np.int32, offset=1) + fixed_data["pos"] = a + with pytest.raises(ValueError, match="NPY_ARRAY_IN_ARRAY"): + _vcztools.VcfEncoder(1, 0, **fixed_data) + + class TestUninitialised: @pytest.mark.parametrize( "name", diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index 140bab9..de4f18f 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -100,7 +100,6 @@ VcfEncoder_add_array( static int check_array(const char *name, PyArrayObject *array, npy_intp dimension) { - int ret = -1; assert(PyArray_CheckExact(array)); @@ -142,10 +141,7 @@ VcfEncoder_add_field_array(VcfEncoder *self, const char *name, PyArrayObject *ar goto out; } } - if (VcfEncoder_add_array(self, prefix, name, array) != 0) { - goto out; - } - ret = 0; + ret = VcfEncoder_add_array(self, prefix, name, array); out: return ret; } @@ -307,7 +303,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) goto out; } num_filters = PyArray_DIMS(filter_ids)[0]; - if (VcfEncoder_add_array(self, "ids/", "filter", filter_ids) != 0) { + if (VcfEncoder_add_array(self, "", "filter_ids", filter_ids) != 0) { goto out; } if (VcfEncoder_store_fixed_array(self, filter, "filter", NPY_BOOL, 2) != 0) { @@ -414,6 +410,7 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) != 0) { goto out; } + // CHECK TYPES err = vcz_variant_encoder_add_gt_field(self->vcf_encoder, PyArray_ITEMSIZE(gt), PyArray_DIMS(gt)[2], PyArray_DATA(gt), PyArray_DATA(gt_phased)); if (err != 0) { From f66fc24db1034b8cf10ef21e1e8ba21f49689e94 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 13:11:38 +0100 Subject: [PATCH 05/11] Add some testing for add_gt --- tests/test_cpython_interface.py | 69 +++++++++++++++++++++++++++++++++ vcztools/_vcztoolsmodule.c | 14 +++++-- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index 187dc81..a2c5bf6 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -220,11 +220,41 @@ def test_add_format_field_bad_array_num_variants(self): with pytest.raises(ValueError, match="number of variants"): encoder.add_format_field("name", np.zeros((3, 1, 1), dtype=bool)) + def test_add_gt_field_bad_gt_array_num_variants(self): + num_variants = 10 + encoder = example_encoder(num_variants, 1) + with pytest.raises(ValueError, match="number of variants"): + encoder.add_gt_field( + np.zeros((2, 1, 1), dtype=np.int8), np.zeros((10, 1), dtype=bool) + ) + + def test_add_gt_field_bad_gt_phased_array_num_variants(self): + num_variants = 10 + encoder = example_encoder(num_variants, 1) + with pytest.raises(ValueError, match="number of variants"): + encoder.add_gt_field( + np.zeros((10, 1, 1), dtype=np.int8), np.zeros((2, 1), dtype=bool) + ) + def test_add_format_field_bad_array_num_samples(self): encoder = example_encoder(5, 2) with pytest.raises(ValueError, match="number of samples"): encoder.add_format_field("name", np.zeros((5, 1, 1), dtype=bool)) + def test_add_gt_field_bad_gt_array_num_samples(self): + encoder = example_encoder(5, 2) + with pytest.raises(ValueError, match="number of samples"): + encoder.add_gt_field( + np.zeros((5, 1, 1), dtype=np.int8), np.zeros((5, 2), dtype=bool) + ) + + def test_add_gt_field_bad_gt_phased_array_num_samples(self): + encoder = example_encoder(5, 2) + with pytest.raises(ValueError, match="number of samples"): + encoder.add_gt_field( + np.zeros((5, 2, 1), dtype=np.int8), np.zeros((5, 1), dtype=bool) + ) + @pytest.mark.parametrize("shape", [(), (5,), (5, 1, 1)]) def test_add_info_field_wrong_dimension(self, shape): encoder = example_encoder(5) @@ -237,6 +267,22 @@ def test_add_format_field_wrong_dimension(self, shape): with pytest.raises(ValueError, match="wrong dimension"): encoder.add_format_field("name", np.zeros(shape, dtype=bool)) + @pytest.mark.parametrize("shape", [(), (5,), (5, 2), (5, 2, 1, 1)]) + def test_add_gt_field_gt_wrong_dimension(self, shape): + encoder = example_encoder(5, 2) + with pytest.raises(ValueError, match="wrong dimension"): + encoder.add_gt_field( + np.zeros(shape, dtype=np.int8), np.zeros((5, 2), dtype=bool) + ) + + @pytest.mark.parametrize("shape", [(), (5,), (5, 2, 1), (5, 2, 1, 1)]) + def test_add_gt_field_gt_phased_wrong_dimension(self, shape): + encoder = example_encoder(5, 2) + with pytest.raises(ValueError, match="wrong dimension"): + encoder.add_gt_field( + np.zeros((5, 2, 1), dtype=np.int8), np.zeros(shape, dtype=bool) + ) + @pytest.mark.parametrize("dtype", ["m", "c16", "O", "u1"]) def test_add_info_field_unsupported_dtype(self, dtype): encoder = example_encoder(1) @@ -249,6 +295,22 @@ def test_add_format_field_unsupported_dtype(self, dtype): with pytest.raises(ValueError, match="unsupported array dtype"): encoder.add_format_field("name", np.zeros((1, 1, 1), dtype=dtype)) + @pytest.mark.parametrize("dtype", ["m", "c16", "O", "u1"]) + def test_add_gt_field_gt_unsupported_dtype(self, dtype): + encoder = example_encoder(1, 1) + with pytest.raises(ValueError, match="unsupported array dtype"): + encoder.add_gt_field( + np.zeros((1, 1, 1), dtype=dtype), np.zeros((1, 1), dtype=bool) + ) + + @pytest.mark.parametrize("dtype", ["m", "c16", "O", "u1"]) + def test_add_gt_field_gt_phased_unsupported_dtype(self, dtype): + encoder = example_encoder(1, 1) + with pytest.raises(ValueError, match="Wrong dtype for gt_phased"): + encoder.add_gt_field( + np.zeros((1, 1, 1), dtype=np.int8), np.zeros((1, 1), dtype=dtype) + ) + @pytest.mark.parametrize("dtype", ["i8", "f8"]) def test_add_info_field_unsupported_width(self, dtype): encoder = example_encoder(1) @@ -261,6 +323,13 @@ def test_add_format_field_unsupported_width(self, dtype): with pytest.raises(ValueError, match="-203"): encoder.add_format_field("name", np.zeros((1, 1, 1), dtype=dtype)) + def test_add_gt_field_unsupported_width(self): + encoder = example_encoder(1, 1) + with pytest.raises(ValueError, match="-203"): + encoder.add_gt_field( + np.zeros((1, 1, 1), dtype=np.int64), np.zeros((1, 1), dtype=bool) + ) + class TestArrays: def test_stored_data_equal(self): diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index de4f18f..3130628 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -403,17 +403,25 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, >, &PyArray_Type, >_phased)) { goto out; } - if (VcfEncoder_add_field_array(self, "GT", gt, 3, "FORMAT/", true) != 0) { + if (VcfEncoder_add_field_array(self, "gt", gt, 3, "FORMAT/", true) != 0) { goto out; } - if (VcfEncoder_add_field_array(self, "GT_phased", gt_phased, 2, "FORMAT/", true) + if (VcfEncoder_add_field_array(self, "gt_phased", gt_phased, 2, "FORMAT/", true) != 0) { goto out; } - // CHECK TYPES + if (PyArray_DTYPE(gt)->kind != 'i') { + PyErr_Format( + PyExc_ValueError, "Array 'gt' has unsupported array dtype"); + goto out; + } + if (check_dtype("gt_phased", gt_phased, NPY_BOOL) != 0) { + goto out; + } err = vcz_variant_encoder_add_gt_field(self->vcf_encoder, PyArray_ITEMSIZE(gt), PyArray_DIMS(gt)[2], PyArray_DATA(gt), PyArray_DATA(gt_phased)); if (err != 0) { + handle_library_error(err); goto out; } From 01e69ae9f335e26f665656178dc20ff2b174cd71 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 13:28:09 +0100 Subject: [PATCH 06/11] Tighten up GT field testing --- tests/test_cpython_interface.py | 25 ++++++++++++++++++++++++- vcztools/_vcztoolsmodule.c | 6 +++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index a2c5bf6..533c64c 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -47,6 +47,17 @@ def example_format_data(num_variants, num_samples): return d +def example_gt_data(num_variants, num_samples, ploidy=2): + return { + "gt": np.arange(num_variants * num_samples * ploidy) + .astype(np.int32) + .reshape(num_variants, num_samples, ploidy), + "gt_phased": (np.arange(num_variants * num_samples) % 2) + .reshape(num_variants, num_samples) + .astype(bool), + } + + def example_encoder(num_variants=1, num_samples=0, add_info=True): encoder = _vcztools.VcfEncoder( num_variants, num_samples, **example_fixed_data(num_variants, num_samples) @@ -57,6 +68,9 @@ def example_encoder(num_variants=1, num_samples=0, add_info=True): if num_samples > 0: for name, data in example_format_data(num_variants, num_samples).items(): encoder.add_format_field(name, data) + + gt_data = example_gt_data(num_variants, num_samples) + encoder.add_gt_field(gt_data["gt"], gt_data["gt_phased"]) # import sys # encoder.print_state(sys.stdout) return encoder @@ -330,13 +344,22 @@ def test_add_gt_field_unsupported_width(self): np.zeros((1, 1, 1), dtype=np.int64), np.zeros((1, 1), dtype=bool) ) + def test_add_gt_field_zero_ploidy(self): + encoder = example_encoder(1, 1) + with pytest.raises(ValueError, match="-204"): + encoder.add_gt_field( + np.zeros((1, 1, 0), dtype=np.int64), np.zeros((1, 1), dtype=bool) + ) + class TestArrays: def test_stored_data_equal(self): num_variants = 20 num_samples = 10 fixed_data = example_fixed_data(num_variants, num_samples) + gt_data = example_gt_data(num_variants, num_samples) encoder = _vcztools.VcfEncoder(num_variants, num_samples, **fixed_data) + encoder.add_gt_field(gt_data["gt"], gt_data["gt_phased"]) info_data = example_info_data(num_variants) format_data = example_format_data(num_variants, num_samples) for name, data in info_data.items(): @@ -346,7 +369,7 @@ def test_stored_data_equal(self): all_data = {**fixed_data} for name, array in info_data.items(): all_data[f"INFO/{name}"] = array - for name, array in format_data.items(): + for name, array in {**format_data, **gt_data}.items(): all_data[f"FORMAT/{name}"] = array encoder_arrays = encoder.arrays assert set(encoder.arrays.keys()) == set(all_data.keys()) diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index 3130628..fa09679 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -174,7 +174,8 @@ np_type_to_vcz_type(const char *name, PyArrayObject *array) } static int -check_dtype(const char *name, PyArrayObject *array, int type) { +check_dtype(const char *name, PyArrayObject *array, int type) +{ if (PyArray_DTYPE(array)->type_num != type) { PyErr_Format(PyExc_ValueError, "Wrong dtype for %s", name); return -1; @@ -411,8 +412,7 @@ VcfEncoder_add_gt_field(VcfEncoder *self, PyObject *args) goto out; } if (PyArray_DTYPE(gt)->kind != 'i') { - PyErr_Format( - PyExc_ValueError, "Array 'gt' has unsupported array dtype"); + PyErr_Format(PyExc_ValueError, "Array 'gt' has unsupported array dtype"); goto out; } if (check_dtype("gt_phased", gt_phased, NPY_BOOL) != 0) { From d727da601491fbef5cc818fb82027e16b4681584 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 13:51:00 +0100 Subject: [PATCH 07/11] Tests for encode --- lib/tests.c | 10 +++++++--- lib/vcf_encoder.c | 7 ++++++- lib/vcf_encoder.h | 18 ++---------------- tests/test_cpython_interface.py | 31 ++++++++++++++++++++++++++++--- vcztools/_vcztoolsmodule.c | 6 +++++- 5 files changed, 48 insertions(+), 24 deletions(-) diff --git a/lib/tests.c b/lib/tests.c index dcc2d5b..bb0f249 100644 --- a/lib/tests.c +++ b/lib/tests.c @@ -57,14 +57,14 @@ validate_encoder( for (buflen = 0; buflen < min_len; buflen++) { buf = malloc((size_t) buflen); CU_ASSERT_FATAL(buf != NULL); - ret = vcz_variant_encoder_write_row(encoder, j, buf, (size_t) buflen); + ret = vcz_variant_encoder_encode(encoder, j, buf, (size_t) buflen); free(buf); CU_ASSERT_FATAL(ret == VCZ_ERR_BUFFER_OVERFLOW); } buflen = min_len; buf = malloc((size_t) buflen); CU_ASSERT_FATAL(buf != NULL); - ret = vcz_variant_encoder_write_row(encoder, j, buf, (size_t) buflen); + ret = vcz_variant_encoder_encode(encoder, j, buf, (size_t) buflen); /* printf("ret = %d\n", (int) ret); */ /* printf("GOT:'%s'\n", buf); */ /* printf("EXP:'%s'\n", expected[j]); */ @@ -82,6 +82,10 @@ validate_encoder( CU_ASSERT_NSTRING_EQUAL_FATAL(buf, expected[j], ret); free(buf); } + for (j = num_rows; j < num_rows + 5; j++) { + ret = vcz_variant_encoder_encode(encoder, j, buf, 0); + CU_ASSERT_EQUAL_FATAL(ret, VCZ_ERR_VARIANT_OUT_OF_BOUNDS); + } } static void @@ -294,7 +298,7 @@ static void test_float_field_2d(void) { // clang-format off - int32_t data[] = { + int32_t data[] = { VCZ_FLOAT32_MISSING_AS_INT32, VCZ_FLOAT32_MISSING_AS_INT32, VCZ_FLOAT32_MISSING_AS_INT32, VCZ_FLOAT32_MISSING_AS_INT32, VCZ_FLOAT32_MISSING_AS_INT32, VCZ_FLOAT32_FILL_AS_INT32, VCZ_FLOAT32_MISSING_AS_INT32, VCZ_FLOAT32_FILL_AS_INT32, VCZ_FLOAT32_FILL_AS_INT32, diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c index 9652436..9cf3c08 100644 --- a/lib/vcf_encoder.c +++ b/lib/vcf_encoder.c @@ -775,12 +775,17 @@ vcz_variant_encoder_write_filter(const vcz_variant_encoder_t *self, size_t varia } int64_t -vcz_variant_encoder_write_row( +vcz_variant_encoder_encode( const vcz_variant_encoder_t *self, size_t variant, char *buf, size_t buflen) { int64_t offset = 0; size_t j; + if (variant >= self->num_variants) { + offset = VCZ_ERR_VARIANT_OUT_OF_BOUNDS; + goto out; + } + for (j = 0; j < VCZ_NUM_FIXED_FIELDS; j++) { if (vcz_field_is_missing_1d(&self->fixed_fields[j], variant)) { offset = append_char(buf, '.', offset, (int64_t) buflen); diff --git a/lib/vcf_encoder.h b/lib/vcf_encoder.h index be29b85..36dc7d2 100644 --- a/lib/vcf_encoder.h +++ b/lib/vcf_encoder.h @@ -37,6 +37,7 @@ #define VCZ_ERR_NO_MEMORY (-100) #define VCZ_ERR_BUFFER_OVERFLOW (-101) +#define VCZ_ERR_VARIANT_OUT_OF_BOUNDS (-102) /* Built-in-limitations */ #define VCZ_ERR_FIELD_NAME_TOO_LONG (-201) @@ -77,21 +78,6 @@ typedef struct { vcz_field_t *format_fields; } vcz_variant_encoder_t; -/* // clang-format off */ -/* int vcz_variant_encoder_init(vcz_variant_encoder_t *self, */ -/* size_t num_variants, size_t num_samples, */ -/* const char *contig_data, size_t contig_item_size, */ -/* const int32_t *position_data, */ -/* const char *id_data, size_t id_item_size, size_t id_num_columns, */ -/* const char *ref_data, size_t ref_item_size, */ -/* const char *alt_data, size_t alt_item_size, size_t alt_num_columns, */ -/* const float *qual_data, */ -/* const char *filter_id_data, size_t filter_id_item_size, size_t - * filter_id_num_columns, */ -/* const int8_t *filter_data */ -/* ); */ -/* // clang-format on */ - int vcz_variant_encoder_init( vcz_variant_encoder_t *self, size_t num_variants, size_t num_samples); void vcz_variant_encoder_free(vcz_variant_encoder_t *self); @@ -117,7 +103,7 @@ int vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, const char * int vcz_variant_encoder_add_format_field(vcz_variant_encoder_t *self, const char *name, int type, size_t item_size, size_t num_columns, const void *data); -int64_t vcz_variant_encoder_write_row( +int64_t vcz_variant_encoder_encode( const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen); int vcz_itoa(char *buf, int64_t v); diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index 533c64c..c19d23c 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -95,6 +95,11 @@ def test_read_only_file(self, tmp_path): with pytest.raises(OSError, match="22"): encoder.print_state(f) + def test_no_arg(self): + encoder = example_encoder() + with pytest.raises(TypeError, match="function takes"): + encoder.print_state() + @pytest.mark.parametrize("fileobj", [None, "path"]) def test_bad_file_arg(self, fileobj): encoder = example_encoder() @@ -108,13 +113,33 @@ def test_bad_fileno(self, fileobj): encoder.print_state(fileobj) -class TestEncodeOverrun: - def test_fixed_fields(self): +class TestEncode: + @pytest.mark.parametrize("bad_arg", [None, {}, "0"]) + def test_bad_index_arg(self, bad_arg): + encoder = example_encoder() + with pytest.raises(TypeError, match="integer"): + encoder.encode(bad_arg, 1) + + @pytest.mark.parametrize("bad_arg", [None, {}, "0"]) + def test_bad_len_arg(self, bad_arg): encoder = example_encoder() + with pytest.raises(TypeError, match="integer"): + encoder.encode(0, bad_arg) + + @pytest.mark.parametrize("bad_row", [2, 3, 100, -1, 2**31 - 1]) + def test_bad_variant_arg(self, bad_row): + encoder = example_encoder(2) + with pytest.raises(ValueError, match="102"): + encoder.encode(bad_row, 1) + + def test_small_example_overrun(self): + encoder = example_encoder(1, 2) s = encoder.encode(0, 1024) - for length in range(len(s)): + minlen = len(s) + for length in range(minlen): with pytest.raises(ValueError, match="-101"): encoder.encode(0, length) + assert s == encoder.encode(0, minlen) class TestFixedFieldInputChecking: diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index fa09679..c11b49d 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -434,6 +434,7 @@ static PyObject * VcfEncoder_encode(VcfEncoder *self, PyObject *args) { PyObject *ret = NULL; + // FIXME these shouldn't be ints int row; int bufsize; char *buf = NULL; @@ -445,11 +446,14 @@ VcfEncoder_encode(VcfEncoder *self, PyObject *args) if (!PyArg_ParseTuple(args, "ii", &row, &bufsize)) { goto out; } + /* Interpret bufsize as the length of the Python string, so add one to + * allow for the NULL byte */ + bufsize++; buf = PyMem_Malloc(bufsize); if (buf == NULL) { goto out; } - line_length = vcz_variant_encoder_write_row(self->vcf_encoder, row, buf, bufsize); + line_length = vcz_variant_encoder_encode(self->vcf_encoder, row, buf, bufsize); if (line_length < 0) { handle_library_error((int) line_length); goto out; From f0fe0fbe5afbaa7e441ca50158ca898f56f1691f Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 15:24:20 +0100 Subject: [PATCH 08/11] Cover a few more corner cases --- tests/test_cpython_interface.py | 19 ++++++++++++++++--- vcztools/_vcztoolsmodule.c | 13 +++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index c19d23c..12ae4d4 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -117,21 +117,26 @@ class TestEncode: @pytest.mark.parametrize("bad_arg", [None, {}, "0"]) def test_bad_index_arg(self, bad_arg): encoder = example_encoder() - with pytest.raises(TypeError, match="integer"): + with pytest.raises(TypeError, match="int"): encoder.encode(bad_arg, 1) @pytest.mark.parametrize("bad_arg", [None, {}, "0"]) def test_bad_len_arg(self, bad_arg): encoder = example_encoder() - with pytest.raises(TypeError, match="integer"): + with pytest.raises(TypeError, match="int"): encoder.encode(0, bad_arg) - @pytest.mark.parametrize("bad_row", [2, 3, 100, -1, 2**31 - 1]) + @pytest.mark.parametrize("bad_row", [2, 3, 100, -1, 2**32, 2**63]) def test_bad_variant_arg(self, bad_row): encoder = example_encoder(2) with pytest.raises(ValueError, match="102"): encoder.encode(bad_row, 1) + def test_buffer_malloc_fail(self): + encoder = example_encoder(1) + with pytest.raises(MemoryError): + encoder.encode(0, 2**63) + def test_small_example_overrun(self): encoder = example_encoder(1, 2) s = encoder.encode(0, 1024) @@ -159,6 +164,14 @@ def test_field_incorrect_dtype(self, name): with pytest.raises(ValueError, match=f"Wrong dtype for {name}"): _vcztools.VcfEncoder(num_variants, 0, **data) + @pytest.mark.parametrize("name", FIXED_FIELD_NAMES + ["filter_ids"]) + def test_field_incorrect_dimension(self, name): + num_variants = 2 + data = example_fixed_data(num_variants) + data[name] = np.expand_dims(data[name], -1) + with pytest.raises(ValueError, match=f"{name} has wrong dimension"): + _vcztools.VcfEncoder(num_variants, 0, **data) + @pytest.mark.parametrize("name", FIXED_FIELD_NAMES + ["filter_ids"]) def test_field_incorrect_type(self, name): num_variants = 5 diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index c11b49d..3de4463 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -434,16 +434,15 @@ static PyObject * VcfEncoder_encode(VcfEncoder *self, PyObject *args) { PyObject *ret = NULL; - // FIXME these shouldn't be ints - int row; - int bufsize; + unsigned long long row; + unsigned long long bufsize; char *buf = NULL; int64_t line_length; if (VcfEncoder_check_state(self) != 0) { goto out; } - if (!PyArg_ParseTuple(args, "ii", &row, &bufsize)) { + if (!PyArg_ParseTuple(args, "KK", &row, &bufsize)) { goto out; } /* Interpret bufsize as the length of the Python string, so add one to @@ -451,14 +450,16 @@ VcfEncoder_encode(VcfEncoder *self, PyObject *args) bufsize++; buf = PyMem_Malloc(bufsize); if (buf == NULL) { + PyErr_NoMemory(); goto out; } - line_length = vcz_variant_encoder_encode(self->vcf_encoder, row, buf, bufsize); + line_length = vcz_variant_encoder_encode( + self->vcf_encoder, (size_t) row, buf, (size_t) bufsize); if (line_length < 0) { handle_library_error((int) line_length); goto out; } - ret = Py_BuildValue("s#", buf, line_length); + ret = Py_BuildValue("s#", buf, (Py_ssize_t) line_length); out: PyMem_Free(buf); return ret; From aa06619c79ec8bf6a929baafd61e3a005a272a74 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 15:31:34 +0100 Subject: [PATCH 09/11] A govr excludes to the python C API code --- vcztools/_vcztoolsmodule.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index 3de4463..5cc8378 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -89,7 +89,7 @@ VcfEncoder_add_array( PyObject *key = PyUnicode_FromFormat("%s%s", prefix, name); if (array == NULL || key == NULL) { - goto out; + goto out; // GCOVR_EXCL_LINE } ret = PyDict_SetItem(self->arrays, key, (PyObject *) array); out: @@ -223,8 +223,8 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) self->vcf_encoder = PyMem_Calloc(1, sizeof(*self->vcf_encoder)); self->arrays = PyDict_New(); if (self->vcf_encoder == NULL || self->arrays == NULL) { - PyErr_NoMemory(); - goto out; + PyErr_NoMemory(); // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!O!O!O!", kwlist, @@ -234,10 +234,12 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) goto out; } + // This function currently cannot fail as there's no memory allocation, but + // we keep the check in case this changes in later versions. err = vcz_variant_encoder_init(self->vcf_encoder, num_variants, num_samples); if (err < 0) { - handle_library_error(err); - goto out; + handle_library_error(err); // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } if (VcfEncoder_store_fixed_array(self, chrom, "chrom", NPY_STRING, 1) != 0) { @@ -269,16 +271,14 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) vcz_variant_encoder_add_chrom_field( self->vcf_encoder, PyArray_ITEMSIZE(chrom), PyArray_DATA(chrom)); if (err < 0) { - // TODO ADD NOCOVER - handle_library_error(err); - goto out; + handle_library_error(err); // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } vcz_variant_encoder_add_ref_field( self->vcf_encoder, PyArray_ITEMSIZE(ref), PyArray_DATA(ref)); if (err < 0) { - // TODO ADD NOCOVER - handle_library_error(err); - goto out; + handle_library_error(err); // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } /* Note: the only provokable error here is the zero-sized second dimension, * which we could catch easily above */ @@ -305,7 +305,7 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) } num_filters = PyArray_DIMS(filter_ids)[0]; if (VcfEncoder_add_array(self, "", "filter_ids", filter_ids) != 0) { - goto out; + goto out; // GCOVR_EXCL_LINE } if (VcfEncoder_store_fixed_array(self, filter, "filter", NPY_BOOL, 2) != 0) { goto out; From 2a16c6f9af0bd27b0c8b3dbd7e389e346c5b1c81 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 15:35:00 +0100 Subject: [PATCH 10/11] Add gcovr skips to memory allocation fails --- lib/vcf_encoder.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c index 9cf3c08..b155f9a 100644 --- a/lib/vcf_encoder.c +++ b/lib/vcf_encoder.c @@ -577,8 +577,8 @@ vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t if (self->num_info_fields > 0) { missing = malloc(self->num_info_fields * sizeof(*missing)); if (missing == NULL) { - offset = VCZ_ERR_NO_MEMORY; - goto out; + offset = VCZ_ERR_NO_MEMORY; // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } for (j = 0; j < self->num_info_fields; j++) { missing[j] = vcz_field_is_missing_1d(&self->info_fields[j], variant); @@ -648,8 +648,8 @@ vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self, if (self->num_format_fields > 0) { missing = malloc(self->num_format_fields * sizeof(*missing)); if (missing == NULL) { - offset = VCZ_ERR_NO_MEMORY; - goto out; + offset = VCZ_ERR_NO_MEMORY; // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } for (j = 0; j < self->num_format_fields; j++) { missing[j] @@ -863,8 +863,8 @@ vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, const char *name tmp = realloc( self->info_fields, self->max_info_fields * sizeof(*self->info_fields)); if (tmp == NULL) { - ret = VCZ_ERR_NO_MEMORY; - goto out; + ret = VCZ_ERR_NO_MEMORY; // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } self->info_fields = tmp; } @@ -891,8 +891,8 @@ vcz_variant_encoder_add_format_field(vcz_variant_encoder_t *self, const char *na tmp = realloc( self->format_fields, self->max_format_fields * sizeof(*self->format_fields)); if (tmp == NULL) { - ret = VCZ_ERR_NO_MEMORY; - goto out; + ret = VCZ_ERR_NO_MEMORY; // GCOVR_EXCL_LINE + goto out; // GCOVR_EXCL_LINE } self->format_fields = tmp; } From 020c2fd6dd8a6b6bcbc5f80ff6c0ec15410f5789 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 16 Jul 2024 15:54:50 +0100 Subject: [PATCH 11/11] Add retry logic for increasing buffer size --- tests/test_cpython_interface.py | 12 +++++++++++- vcztools/_vcztoolsmodule.c | 17 +++++++++++++++-- vcztools/vcf_writer.py | 13 +++++++++++-- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/tests/test_cpython_interface.py b/tests/test_cpython_interface.py index 12ae4d4..3210479 100644 --- a/tests/test_cpython_interface.py +++ b/tests/test_cpython_interface.py @@ -142,10 +142,20 @@ def test_small_example_overrun(self): s = encoder.encode(0, 1024) minlen = len(s) for length in range(minlen): - with pytest.raises(ValueError, match="-101"): + with pytest.raises(_vcztools.VczBufferTooSmall, match="-101"): encoder.encode(0, length) assert s == encoder.encode(0, minlen) + @pytest.mark.parametrize("variant", [0, 999, 333, 501]) + def test_large_example_overrun(self, variant): + encoder = example_encoder(1000, 100) + s = encoder.encode(variant, 1024 * 1024) + minlen = len(s) + for length in range(minlen): + with pytest.raises(_vcztools.VczBufferTooSmall, match="-101"): + encoder.encode(variant, length) + assert s == encoder.encode(variant, minlen) + class TestFixedFieldInputChecking: @pytest.mark.parametrize("name", FIXED_FIELD_NAMES) diff --git a/vcztools/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c index 5cc8378..0e80f7f 100644 --- a/vcztools/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -17,11 +17,20 @@ typedef struct { } VcfEncoder; // clang-format on +static PyObject *VczBufferTooSmall; + static void handle_library_error(int err) { - // TODO generate string messages. - PyErr_Format(PyExc_ValueError, "Error occured: %d: ", err); + switch (err) { + case VCZ_ERR_BUFFER_OVERFLOW: + PyErr_Format( + VczBufferTooSmall, "Error: %d; specified buffer size is too small", err); + break; + // TODO handle the other error types. + default: + PyErr_Format(PyExc_ValueError, "Error occured: %d: ", err); + } } static FILE * @@ -576,6 +585,10 @@ PyInit__vcztools(void) /* Initialise numpy */ import_array(); + VczBufferTooSmall = PyErr_NewException("_vcztools.VczBufferTooSmall", NULL, NULL); + Py_INCREF(VczBufferTooSmall); + PyModule_AddObject(module, "VczBufferTooSmall", VczBufferTooSmall); + /* VcfEncoder type */ if (PyType_Ready(&VcfEncoderType) < 0) { return NULL; diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index b383df1..54fe78e 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -258,9 +258,18 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): if len(array.shape) == 2: array = array.reshape((num_variants, num_samples, 1)) encoder.add_format_field(name, array) - + # TODO: (1) make a guess at this based on number of fields and samples, + # and (2) log a DEBUG message when we have to double. + buflen = 1024 for j in range(num_variants): - line = encoder.encode(j, 2**20) + failed = True + while failed: + try: + line = encoder.encode(j, buflen) + failed = False + except _vcztools.VczBufferTooSmall: + buflen *= 2 + # print("Bumping buflen to", buflen) print(line, file=output)