diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e91270..ddce92a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,6 +36,8 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install '.[dev]' + # Build the extension module in-place so pytest can find it + python3 setup.py build_ext --inplace - name: Run tests run: | pytest @@ -61,4 +63,6 @@ jobs: - name: Check vcztools CLI run: | vcztools --help + # Make sure we don't have ``vcztools`` in the CWD + cd tests python -m vcztools --help diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..26f5eea --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include lib/*.h diff --git a/Makefile b/Makefile index 1fac31e..3583e92 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ all: ext -ext: _vcztoolsmodule.c +ext: vcztools/_vcztoolsmodule.c CFLAGS="-std=c99 -Wall -Wextra -Werror -Wno-unused-parameter -Wno-cast-function-type" \ python3 setup.py build_ext --inplace clean: - rm -f *.so *.o tags + rm -f vcztools/*.so rm -fR build diff --git a/lib/tests.c b/lib/tests.c index 4f22ba2..27437d0 100644 --- a/lib/tests.c +++ b/lib/tests.c @@ -118,24 +118,27 @@ test_variant_encoder_minimal(void) const int32_t pos_data[] = { 123, 45678 }; const char id_data[] = "RS1RS2"; const char ref_data[] = "AG"; - const char alt_data[] = "TC"; - const int32_t qual_data[] = { 1000, 12 }; - const char filter_data[] = "PASSPASS"; + const char alt_data[] = "T"; + const float qual_data[] = { 9, 12.1 }; + const char filter_id_data[] = "PASS\0FILT1"; + const int8_t filter_data[] = {1, 0, 0, 1}; const int32_t an_data[] = { -1, 9 }; const char* aa_data = "G."; + const int8_t flag_data[] = {0, 1}; const int32_t gt_data[] = { 0, 0, 0, 1, 1, 1, 1, 0 }; const int8_t gt_phased_data[] = { 0, 1, 1, 0 }; const int32_t hq_data[] = { 10, 15, 7, 12, -1, -1, -1, -1}; int ret, j; vcz_variant_encoder_t writer; const char *expected[] = { - "X\t123\tRS1\tA\tT\t1000\tPASS\tAA=G\tGT:HQ\t0/0:10,15\t0|1:7,12", - "YY\t45678\tRS2\tG\tC\t12\tPASS\tAN=9\tGT\t1|1\t1/0", + "X\t123\tRS1\tA\tT\t9\tPASS\tAA=G\tGT:HQ\t0/0:10,15\t0|1:7,12", + "YY\t45678\tRS2\tG\t.\t12.1\tFILT1\tAN=9;FLAG\tGT\t1|1\t1/0", }; char buf[1000]; ret = vcz_variant_encoder_init(&writer, 2, 2, contig_data, 2, pos_data, id_data, 3, - 1, ref_data, 1, alt_data, 1, 1, qual_data, filter_data, 4, 1); + 1, ref_data, 1, alt_data, 1, 1, qual_data, + filter_id_data, 5, 2, filter_data); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = vcz_variant_encoder_add_gt_field(&writer, gt_data, 4, 2, gt_phased_data); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -143,20 +146,23 @@ test_variant_encoder_minimal(void) CU_ASSERT_EQUAL_FATAL(ret, 0); ret = vcz_variant_encoder_add_info_field(&writer, "AA", VCZ_TYPE_STRING, 1, 1, aa_data); CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = vcz_variant_encoder_add_info_field(&writer, "FLAG", VCZ_TYPE_BOOL, 1, 1, flag_data); + CU_ASSERT_EQUAL_FATAL(ret, 0); ret = vcz_variant_encoder_add_format_field( &writer, "HQ", VCZ_TYPE_INT, 4, 2, hq_data); CU_ASSERT_EQUAL_FATAL(ret, 0); - printf("\n"); vcz_variant_encoder_print_state(&writer, _devnull); + printf("\n"); /* vcz_variant_encoder_print_state(&writer, stdout); */ for (j = 0; j < num_rows; j++) { ret = vcz_variant_encoder_write_row(&writer, j, buf, 1000); + /* printf("ret = %d\n", ret); */ printf("GOT:%s\n", buf); printf("EXP:%s\n", expected[j]); - printf("GOT:%d\n", (int) strlen(buf)); - printf("EXP:%d\n", (int) strlen(expected[j])); + /* printf("GOT:%d\n", (int) strlen(buf)); */ + /* printf("EXP:%d\n", (int) strlen(expected[j])); */ CU_ASSERT_EQUAL(ret, strlen(expected[j])); CU_ASSERT_STRING_EQUAL(buf, expected[j]); } diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c index 6fc15dc..97aaf22 100644 --- a/lib/vcf_encoder.c +++ b/lib/vcf_encoder.c @@ -55,6 +55,13 @@ vcz_itoa(int32_t value, char *buf) return p; } +static bool +bool_all_missing(const int8_t *restrict data, size_t n) +{ + assert(n == 1); + return !data[0]; +} + static bool int32_all_missing(const int32_t *restrict data, size_t n) { @@ -68,6 +75,20 @@ int32_all_missing(const int32_t *restrict data, size_t n) return true; } +static bool +float32_all_missing(const float *restrict data, size_t n) +{ + size_t j; + const int32_t *restrict di32 = (const int32_t *) data; + + for (j = 0; j < n; j++) { + if (di32[j] != VCZ_FLOAT32_FILL_AS_INT32 + && di32[j] != VCZ_FLOAT32_MISSING_AS_INT32) { + return false; + } + } + return true; +} static bool string_all_missing(const char *restrict data, size_t item_size, size_t n) { @@ -109,6 +130,15 @@ string_field_write_entry( return offset; } +static int64_t +bool_field_write_entry( + const vcz_field_t *self, const void *data, char *dest, size_t buflen, int64_t offset) +{ + /* Erase the previously inserted "=" we need for other fields.*/ + dest[offset - 1] = '\t'; + return offset; +} + static int64_t int32_field_write_entry( const vcz_field_t *self, const void *data, char *dest, size_t buflen, int64_t offset) @@ -144,6 +174,44 @@ int32_field_write_entry( return offset; } +static int64_t +float32_field_write_entry( + const vcz_field_t *self, const void *data, char *dest, size_t buflen, int64_t offset) +{ + const float *source = (float *) data; + const int32_t *int32_source = (int32_t *) data; + int32_t int32_value; + float value; + size_t column; + int written; + char value_buffer[128]; + + for (column = 0; column < self->num_columns; column++) { + int32_value = int32_source[column]; + if (int32_value == VCZ_FLOAT32_FILL_AS_INT32) { + break; + } + if (column > 0) { + dest[offset] = ','; + offset++; + } + if (int32_value == VCZ_FLOAT32_MISSING_AS_INT32) { + dest[offset] = '.'; + offset++; + } else { + /* offset += vcz_itoa(value, dest + offset); */ + value = source[column]; + written = snprintf(value_buffer, sizeof(value_buffer), "%.3g", value); + memcpy(dest + offset, value_buffer, written); + offset += written; + } + } + dest[offset] = '\t'; + offset++; + dest[offset] = '\0'; + return offset; +} + int64_t vcz_field_write_entry( const vcz_field_t *self, const void *data, char *dest, size_t buflen, int64_t offset) @@ -153,6 +221,13 @@ vcz_field_write_entry( if (self->item_size == 4) { return int32_field_write_entry(self, data, dest, buflen, offset); } + } else if (self->type == VCZ_TYPE_FLOAT) { + assert(self->item_size == 4); + return float32_field_write_entry(self, data, dest, buflen, offset); + } else if (self->type == VCZ_TYPE_BOOL) { + assert(self->item_size == 1); + assert(self->num_columns == 1); + return bool_field_write_entry(self, data, dest, buflen, offset); } else if (self->type == VCZ_TYPE_STRING) { return string_field_write_entry(self, data, dest, buflen, offset); } @@ -181,6 +256,10 @@ vcz_info_field_is_missing(const vcz_field_t *self, size_t variant) if (self->item_size == 4) { return int32_all_missing(data, self->num_columns); } + } else if (self->type == VCZ_TYPE_FLOAT) { + return float32_all_missing(data, self->num_columns); + } else if (self->type == VCZ_TYPE_BOOL) { + return bool_all_missing(data, self->num_columns); } else if (self->type == VCZ_TYPE_STRING) { return string_all_missing(data, self->item_size, self->num_columns); } @@ -199,7 +278,8 @@ vcz_format_field_is_missing(const vcz_field_t *self, size_t variant, size_t num_ return int32_all_missing(data, self->num_columns * num_samples); } } else if (self->type == VCZ_TYPE_STRING) { - return string_all_missing(data, self->item_size, self->num_columns * num_samples); + return string_all_missing( + data, self->item_size, self->num_columns * num_samples); } assert(false); return false; @@ -255,37 +335,6 @@ vcz_variant_encoder_write_sample_gt(const vcz_variant_encoder_t *self, size_t va return offset; } -/* int64_t */ -/* vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self, */ -/* size_t variant, size_t sample, char *dest, size_t buflen, int64_t offset) */ -/* { */ -/* vcz_field_t field; */ -/* size_t j, row_size; */ -/* const void *data; */ - -/* if (self->gt.data != NULL) { */ -/* offset = vcz_variant_encoder_write_sample_gt( */ -/* self, variant, sample, dest, buflen, offset); */ -/* if (offset < 0) { */ -/* goto out; */ -/* } */ -/* } */ - -/* for (j = 0; j < self->num_format_fields; j++) { */ -/* field = self->format_fields[j]; */ -/* dest[offset - 1] = ':'; */ -/* row_size = self->num_samples * field.num_columns * field.item_size; */ -/* data = field.data + variant * row_size */ -/* + sample * field.num_columns * field.item_size; */ -/* offset = vcz_field_write_entry(&field, data, dest, buflen, offset); */ -/* if (offset < 0) { */ -/* goto out; */ -/* } */ -/* } */ -/* out: */ -/* return offset; */ -/* } */ - int64_t vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t variant, char *dest, size_t buflen, int64_t offset) @@ -342,10 +391,9 @@ vcz_variant_encoder_write_info_fields(const vcz_variant_encoder_t *self, size_t return offset; } - static int64_t -vcz_variant_encoder_write_format_fields( - const vcz_variant_encoder_t *self, size_t variant, char *buf, size_t buflen, int64_t offset) +vcz_variant_encoder_write_format_fields(const vcz_variant_encoder_t *self, + size_t variant, char *buf, size_t buflen, int64_t offset) { size_t j, sample, row_size; vcz_field_t field; @@ -367,7 +415,8 @@ vcz_variant_encoder_write_format_fields( goto out; } for (j = 0; j < self->num_format_fields; j++) { - missing[j] = vcz_format_field_is_missing(&self->format_fields[j], variant, num_samples); + missing[j] = vcz_format_field_is_missing( + &self->format_fields[j], variant, num_samples); if (!missing[j]) { all_missing = false; } @@ -375,7 +424,7 @@ vcz_variant_encoder_write_format_fields( } all_missing = all_missing && gt_missing; - if (! all_missing) { + if (!all_missing) { if (!gt_missing) { strcpy(buf + offset, "GT:"); @@ -423,6 +472,45 @@ vcz_variant_encoder_write_format_fields( return offset; } +static int64_t +vcz_variant_encoder_write_filter(const vcz_variant_encoder_t *self, size_t variant, + char *buf, size_t buflen, int64_t offset) +{ + const vcz_field_t filter_id = self->filter_id; + bool all_missing = true; + const int8_t *restrict data = self->filter_data + (variant * filter_id.num_columns); + const char *filter_id_data = (const char *) self->filter_id.data; + size_t j, k, source_offset; + + for (j = 0; j < filter_id.num_columns; j++) { + if (data[j]) { + all_missing = false; + } + } + if (all_missing) { + buf[offset] = '.'; + offset++; + } else { + source_offset = 0; + for (j = 0; j < filter_id.num_columns; j++) { + if (data[j]) { + source_offset = j * filter_id.item_size; + for (k = 0; k < filter_id.item_size; k++) { + if (filter_id_data[source_offset] == VCZ_STRING_FILL) { + break; + } + buf[offset] = filter_id_data[source_offset]; + offset++; + source_offset++; + } + } + } + } + buf[offset] = '\t'; + offset++; + return offset; +} + int64_t vcz_variant_encoder_write_row( const vcz_variant_encoder_t *self, size_t variant, char *buf, size_t buflen) @@ -431,11 +519,22 @@ vcz_variant_encoder_write_row( size_t j; for (j = 0; j < VCZ_NUM_FIXED_FIELDS; j++) { - offset = vcz_field_write(&self->fixed_fields[j], variant, buf, buflen, offset); - if (offset < 0) { - goto out; + if (vcz_info_field_is_missing(&self->fixed_fields[j], variant)) { + buf[offset] = '.'; + offset++; + buf[offset] = '\t'; + offset++; + } else { + offset = vcz_field_write(&self->fixed_fields[j], variant, buf, buflen, offset); + if (offset < 0) { + goto out; + } } } + offset = vcz_variant_encoder_write_filter(self, variant, buf, buflen, offset); + if (offset < 0) { + goto out; + } offset = vcz_variant_encoder_write_info_fields(self, variant, buf, buflen, offset); if (offset < 0) { goto out; @@ -458,6 +557,7 @@ vcz_variant_encoder_print_state(const vcz_variant_encoder_t *self, FILE *out) fprintf(out, "vcz_variant_encoder: %p\n", (void *) self); fprintf(out, "\tnum_samples: %d\n", (int) self->num_samples); fprintf(out, "\tnum_variants: %d\n", (int) self->num_variants); + vcz_field_print_state(&self->filter_id, out); for (j = 0; j < VCZ_NUM_FIXED_FIELDS; j++) { vcz_field_print_state(&self->fixed_fields[j], out); } @@ -485,6 +585,11 @@ vcz_variant_encoder_add_field(vcz_variant_encoder_t *self, vcz_field_t *field, ret = VCZ_ERR_FIELD_NAME_TOO_LONG; goto out; } + // TODO MORE CHECKS + if (type == VCZ_TYPE_BOOL && item_size != 1) { + ret = VCZ_ERR_FIELD_UNSUPPORTED_TYPE; + goto out; + } strcpy(field->name, name); field->type = type; field->item_size = item_size; @@ -512,9 +617,12 @@ vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, const char *name self->info_fields = tmp; } field = self->info_fields + self->num_info_fields; - self->num_info_fields++; ret = vcz_variant_encoder_add_field( self, field, name, type, item_size, num_columns, data); + if (ret != 0) { + goto out; + } + self->num_info_fields++; out: return ret; } @@ -567,9 +675,9 @@ vcz_variant_encoder_init(vcz_variant_encoder_t *self, const char *id_data, size_t id_item_size, size_t id_num_columns, const char *ref_data, size_t ref_item_size, const char *alt_data, size_t alt_item_size, size_t alt_num_columns, - const int32_t *qual_data, - const char *filter_data, size_t filter_item_size, size_t filter_num_columns) - + const float *qual_data, + const char *filter_id_data, size_t filter_id_item_size, size_t filter_id_num_columns, + const int8_t *filter_data) { vcz_field_t fixed_fields[] = { { .name = "CHROM", @@ -598,24 +706,24 @@ vcz_variant_encoder_init(vcz_variant_encoder_t *self, .num_columns = alt_num_columns, .data = alt_data }, { .name = "QUAL", - .type = VCZ_TYPE_INT, + .type = VCZ_TYPE_FLOAT, .item_size = 4, .num_columns = 1, - .data = qual_data }, - { .name = "FILTER", - .type = VCZ_TYPE_STRING, - .item_size = filter_item_size, - .num_columns = filter_num_columns, - .data = filter_data } }; - + .data = qual_data } + }; // clang-format on memset(self, 0, sizeof(*self)); self->num_samples = num_samples; self->num_variants = num_variants; - self->field_array_size_increment = 64; + self->field_array_size_increment = 64; // arbitrary + strcpy(self->filter_id.name, "IDS/FILTERS"); + self->filter_id.type = VCZ_TYPE_STRING; + self->filter_id.item_size = filter_id_item_size; + self->filter_id.data = filter_id_data; + self->filter_id.num_columns = filter_id_num_columns; + self->filter_data = filter_data; memcpy(self->fixed_fields, fixed_fields, sizeof(fixed_fields)); - return 0; } diff --git a/lib/vcf_encoder.h b/lib/vcf_encoder.h index 421816f..79f9a52 100644 --- a/lib/vcf_encoder.h +++ b/lib/vcf_encoder.h @@ -6,12 +6,16 @@ #define VCZ_INT_FILL -2 #define VCZ_STRING_MISSING '.' #define VCZ_STRING_FILL '\0' +#define VCZ_FLOAT32_MISSING_AS_INT32 0x7F800001 +#define VCZ_FLOAT32_FILL_AS_INT32 0x7F800002 -#define VCZ_NUM_FIXED_FIELDS 7 +#define VCZ_NUM_FIXED_FIELDS 6 #define VCZ_TYPE_INT 0 -#define VCZ_TYPE_FLOAT 0 +#define VCZ_TYPE_FLOAT 1 #define VCZ_TYPE_STRING 2 +#define VCZ_TYPE_BOOL 3 + // arbitrary - we can increase if needs be #define VCZ_MAX_FIELD_NAME_LEN 256 @@ -39,11 +43,13 @@ typedef struct { size_t num_variants; size_t num_samples; vcz_field_t fixed_fields[VCZ_NUM_FIXED_FIELDS]; + vcz_field_t filter_id; + const int8_t *filter_data; vcz_field_t gt; + const int8_t *gt_phased_data; size_t num_info_fields; size_t max_info_fields; vcz_field_t *info_fields; - const int8_t *gt_phased_data; size_t num_format_fields; size_t max_format_fields; size_t field_array_size_increment; @@ -58,8 +64,9 @@ int vcz_variant_encoder_init(vcz_variant_encoder_t *self, const char *id_data, size_t id_item_size, size_t id_num_columns, const char *ref_data, size_t ref_item_size, const char *alt_data, size_t alt_item_size, size_t alt_num_columns, - const int32_t *qual_data, const char *filter_data, size_t filter_item_size, - size_t filter_num_columns + const float *qual_data, + const char *filter_id_data, size_t filter_id_item_size, size_t filter_id_num_columns, + const int8_t *filter_data ); // clang-format on diff --git a/pyproject.toml b/pyproject.toml index 87e177a..1b53341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,12 @@ build-backend = "setuptools.build_meta" [project] name = "vcztools" -version = "0.0.1" +description = "Implementation of bcftools for VCF Zarr" +readme = "README.md" +license = {file = "LICENSE"} +authors = [ + {name = "sgkit Developers", email = "project@sgkit.dev"}, +] dependencies = [ "numpy>=1.23.5,<2", "numba", @@ -12,6 +17,7 @@ dependencies = [ "click", ] requires-python = ">=3.9" +dynamic = ["version"] [project.scripts] vcztools = "vcztools.cli:vcztools_main" @@ -29,3 +35,6 @@ packages = ["vcztools"] [tool.pytest.ini_options] testpaths = ["tests"] + +[tool.setuptools_scm] +write_to = "vcztools/_version.py" diff --git a/setup.py b/setup.py index a4e9d2f..a972a27 100644 --- a/setup.py +++ b/setup.py @@ -3,12 +3,13 @@ from setuptools import setup _vcztools_module = Extension( - "_vcztools", - sources=["_vcztoolsmodule.c", "lib/vcf_encoder.c"], + "vcztools._vcztools", + sources=["vcztools/_vcztoolsmodule.c", "lib/vcf_encoder.c"], extra_compile_args=["-std=c99"], include_dirs=["lib", numpy.get_include()], ) setup( + name="vcztools", ext_modules=[_vcztools_module], ) diff --git a/tests/test_vcf_roundtrip.py b/tests/test_vcf_roundtrip.py index 26e5921..adcd863 100644 --- a/tests/test_vcf_roundtrip.py +++ b/tests/test_vcf_roundtrip.py @@ -8,7 +8,8 @@ @pytest.mark.parametrize( "vcf_file", ["sample.vcf.gz"] ) -def test_vcf_to_zarr_to_vcf__real_files(shared_datadir, tmp_path, vcf_file): +@pytest.mark.parametrize("implementation", ["c", "numba"]) +def test_vcf_to_zarr_to_vcf__real_files(shared_datadir, tmp_path, vcf_file, implementation): path = shared_datadir / "vcf" / vcf_file intermediate_icf = tmp_path.joinpath("intermediate.icf") intermediate_vcz = tmp_path.joinpath("intermediate.vcz") @@ -18,6 +19,6 @@ def test_vcf_to_zarr_to_vcf__real_files(shared_datadir, tmp_path, vcf_file): [path], intermediate_vcz, icf_path=intermediate_icf, worker_processes=0 ) - write_vcf(intermediate_vcz, output) + write_vcf(intermediate_vcz, output, implementation=implementation) assert_vcfs_close(path, output) diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py index 69cfc97..d577b6f 100644 --- a/tests/test_vcf_writer.py +++ b/tests/test_vcf_writer.py @@ -11,7 +11,8 @@ @pytest.mark.parametrize("output_is_path", [True, False]) -def test_write_vcf(shared_datadir, tmp_path, output_is_path): +@pytest.mark.parametrize("implementation", ["c", "numba"]) +def test_write_vcf(shared_datadir, tmp_path, output_is_path, implementation): path = shared_datadir / "vcf" / "sample.vcf.gz" intermediate_icf = tmp_path.joinpath("intermediate.icf") intermediate_vcz = tmp_path.joinpath("intermediate.vcz") @@ -22,10 +23,10 @@ def test_write_vcf(shared_datadir, tmp_path, output_is_path): ) if output_is_path: - write_vcf(intermediate_vcz, output) + write_vcf(intermediate_vcz, output, implementation=implementation) else: output_str = StringIO() - write_vcf(intermediate_vcz, output_str) + write_vcf(intermediate_vcz, output_str, implementation=implementation) with open(output, "w") as f: f.write(output_str.getvalue()) diff --git a/tests/utils.py b/tests/utils.py index ba30b3f..b893841 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -80,7 +80,8 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03): else: assert val1 == val2, f"INFO {k} not equal for variants\n{v1}{v2}" - assert v1.FORMAT == v2.FORMAT, f"FORMAT not equal for variants\n{v1}{v2}" + # NOTE skipping this because it requires items to be in the same order. + # assert v1.FORMAT == v2.FORMAT, f"FORMAT not equal for variants\n{v1}{v2}" for field in v1.FORMAT: if field == "GT": assert ( diff --git a/_vcztoolsmodule.c b/vcztools/_vcztoolsmodule.c similarity index 84% rename from _vcztoolsmodule.c rename to vcztools/_vcztoolsmodule.c index 6a86355..89eecc5 100644 --- a/_vcztoolsmodule.c +++ b/vcztools/_vcztoolsmodule.c @@ -135,14 +135,18 @@ static int VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) { int ret = -1; - static char *kwlist[] - = { "num_variants", "num_samples", "chrom", "pos", "id", "ref", "alt", NULL }; + static char *kwlist[] = { "num_variants", "num_samples", "chrom", "pos", "id", "ref", + "alt", "qual", "filter_ids", "filter", NULL }; int num_variants, num_samples; PyArrayObject *chrom = NULL; PyArrayObject *pos = NULL; PyArrayObject *id = NULL; PyArrayObject *ref = NULL; PyArrayObject *alt = NULL; + PyArrayObject *qual = NULL; + PyArrayObject *filter_ids = NULL; + PyArrayObject *filter = NULL; + npy_intp num_filters; int err; self->vcf_encoder = NULL; @@ -154,11 +158,13 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) goto out; } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!", kwlist, &num_variants, - &num_samples, &PyArray_Type, &chrom, &PyArray_Type, &pos, &PyArray_Type, &id, - &PyArray_Type, &ref, &PyArray_Type, &alt)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "iiO!O!O!O!O!O!O!O!", kwlist, + &num_variants, &num_samples, &PyArray_Type, &chrom, &PyArray_Type, &pos, + &PyArray_Type, &id, &PyArray_Type, &ref, &PyArray_Type, &alt, &PyArray_Type, + &qual, &PyArray_Type, &filter_ids, &PyArray_Type, &filter)) { goto out; } + if (VcfEncoder_store_fixed_array(self, chrom, "CHROM", NPY_STRING, 1, num_variants) != 0) { goto out; @@ -178,17 +184,53 @@ VcfEncoder_init(VcfEncoder *self, PyObject *args, PyObject *kwds) != 0) { goto out; } + if (VcfEncoder_store_fixed_array(self, qual, "QUAL", NPY_FLOAT32, 1, num_variants) + != 0) { + goto out; + } + + /* NOTE: we generalise this pattern for CHROM also to save a bit of time + * in building numpy String arrays */ + assert(PyArray_CheckExact(filter_ids)); + if (!PyArray_CHKFLAGS(filter_ids, NPY_ARRAY_IN_ARRAY)) { + PyErr_SetString(PyExc_ValueError, "Array must have NPY_ARRAY_IN_ARRAY flags."); + goto out; + } + if (PyArray_DTYPE(filter_ids)->type_num != NPY_STRING) { + PyErr_Format(PyExc_ValueError, "filter_ids is not of the correct type"); + goto out; + } + + if (PyArray_NDIM(filter_ids) != 1) { + PyErr_Format(PyExc_ValueError, "filter_ids must be 1D"); + goto out; + } + num_filters = PyArray_DIMS(filter_ids)[0]; + if (VcfEncoder_add_array(self, "IDS/", "FILTER", filter_ids) != 0) { + goto out; + } + if (VcfEncoder_store_fixed_array(self, filter, "filter", NPY_BOOL, 2, num_variants) + != 0) { + goto out; + } + if (PyArray_DIMS(filter)[1] != num_filters) { + PyErr_Format( + PyExc_ValueError, "filters dimension must be (num_variants, num_filters)"); + goto out; + } // clang-format off err = vcz_variant_encoder_init( - self->vcf_encoder, num_samples, num_variants, - PyArray_DATA(chrom), - PyArray_ITEMSIZE(chrom), PyArray_DATA(pos), + self->vcf_encoder, + num_samples, num_variants, + PyArray_DATA(chrom), PyArray_ITEMSIZE(chrom), + PyArray_DATA(pos), PyArray_DATA(id), PyArray_ITEMSIZE(id), PyArray_DIMS(id)[1], PyArray_DATA(ref), PyArray_ITEMSIZE(ref), PyArray_DATA(alt), PyArray_ITEMSIZE(alt), PyArray_DIMS(alt)[1], - PyArray_DATA(pos), - PyArray_DATA(chrom), PyArray_ITEMSIZE(chrom), 1 // FIXME + PyArray_DATA(qual), + PyArray_DATA(filter_ids), PyArray_ITEMSIZE(filter_ids), num_filters, + PyArray_DATA(filter) ); // clang-format on if (err < 0) { @@ -237,7 +279,7 @@ VcfEncoder_add_field_array(VcfEncoder *self, const char *name, PyArrayObject *ar } static int -np_type_to_vcz_type(PyArrayObject *array) +np_type_to_vcz_type(const char *name, PyArrayObject *array) { int ret = -1; @@ -245,11 +287,18 @@ np_type_to_vcz_type(PyArrayObject *array) case 'i': ret = VCZ_TYPE_INT; break; + case 'f': + ret = VCZ_TYPE_FLOAT; + break; case 'S': ret = VCZ_TYPE_STRING; break; + case 'b': + ret = VCZ_TYPE_BOOL; + break; default: - PyErr_SetString(PyExc_ValueError, "Unsupported array dtype"); + PyErr_Format( + PyExc_ValueError, "Array '%s' has unsupported array dtype", name); goto out; } out: @@ -270,7 +319,7 @@ VcfEncoder_add_info_field(VcfEncoder *self, PyObject *args) if (VcfEncoder_add_field_array(self, name, array, 2, "INFO/") != 0) { goto out; } - type = np_type_to_vcz_type(array); + type = np_type_to_vcz_type(name, array); if (type < 0) { goto out; } @@ -300,7 +349,7 @@ VcfEncoder_add_format_field(VcfEncoder *self, PyObject *args) if (VcfEncoder_add_field_array(self, name, array, 3, "FORMAT/") != 0) { goto out; } - type = np_type_to_vcz_type(array); + type = np_type_to_vcz_type(name, array); if (type < 0) { goto out; } diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index db5d5f4..3ca6819 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -7,7 +7,7 @@ import numpy as np import zarr -import _vcztools +from . import _vcztools from .constants import FLOAT32_MISSING, RESERVED_VARIABLE_NAMES from .vcf_writer_utils import ( @@ -192,7 +192,7 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): ref = alleles[:, 0].astype("S") alt = alleles[:, 1:].astype("S") qual = root.variant_quality.blocks[v_chunk] - # filter_ = filters[root.variant_filter.blocks[v_chunk]] + filter_ = root.variant_filter.blocks[v_chunk] num_variants = len(pos) if len(id.shape) == 1: @@ -226,8 +226,18 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): num_samples = gt.shape[1] encoder = _vcztools.VcfEncoder( - num_variants, num_samples, chrom=chrom, pos=pos, id=id, alt=alt, ref=ref + num_variants, + num_samples, + chrom=chrom, + pos=pos, + id=id, + alt=alt, + ref=ref, + qual=qual, + filter_ids=filters, + filter=filter_, ) + # print(encoder.arrays) encoder.add_gt_field(gt.astype("int32"), gt_phased) for name, array in info_fields.items(): if array.dtype.kind == "O": @@ -236,11 +246,6 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): array = array.reshape((num_variants, 1)) if array.dtype.kind == "i": array = array.astype("int32") # tmp - if array.dtype.kind == "f": - continue # tmp - if array.dtype.kind == "b": - continue # tmp - # array = array.astype("int32") # tmp encoder.add_info_field(name, array) for name, array in format_fields.items(): @@ -250,22 +255,16 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): array = array.reshape((num_variants, num_samples, 1)) if array.dtype.kind == "i": array = array.astype("int32") # tmp - if array.dtype.kind == "f": - continue # tmp - # array = array.astype("int32") # tmp encoder.add_format_field(name, array) for j in range(num_variants): line = encoder.encode_row(j, 2**30) - print(line) + print(line, file=output) def numba_chunk_to_vcf( root, v_chunk, header_info_fields, header_format_fields, contigs, filters, output ): - # n_variants = ds.sizes["variants"] # number of variants in this chunk - # n_samples = ds.sizes["samples"] # number of samples in whole dataset - # fixed fields chrom = root.variant_contig.blocks[v_chunk]