diff --git a/lib/meson.build b/lib/meson.build index 5a7435b..1b5b3b5 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -20,5 +20,5 @@ cunit_dep = dependency('cunit') tests = executable('tests', sources: ['tests.c', 'vcf_encoder.c'], - dependencies: cunit_dep, + dependencies: [cunit_dep, m_dep], ) diff --git a/lib/tests.c b/lib/tests.c index 27437d0..e34cc7e 100644 --- a/lib/tests.c +++ b/lib/tests.c @@ -179,13 +179,63 @@ test_itoa_small(void) for (j = -255; j <= 256; j++) { len1 = sprintf(dest1, "%d", j); - len2 = vcz_itoa(j, dest2); + len2 = vcz_itoa(dest2, j); /* printf("%s %s\n", dest1, dest2); */ CU_ASSERT_STRING_EQUAL(dest1, dest2); CU_ASSERT_EQUAL(len1, len2); } } + +static void +test_ftoa(void) +{ + struct test_case { + float val; + const char *expected; + }; + struct test_case cases[] = { + {0.0, "0"}, + {0.0001, "0"}, + {0.0005, "0.001"}, + {0.3, "0.3"}, + {0.32, "0.32"}, + {0.329, "0.329"}, + {0.3217, "0.322"}, + {8.0, "8"}, + {8.0001, "8"}, + {8.3, "8.3"}, + {8.32, "8.32"}, + {8.329, "8.329"}, + {8.3217, "8.322"}, + {443.998, "443.998"}, + {1028.0, "1028"}, + {1028.0001, "1028"}, + {1028.3, "1028.3"}, + {1028.32, "1028.32"}, + {1028.329, "1028.329"}, + {1028.3217, "1028.322"}, + {1000000, "1000000"}, + {-100.0, "-100"}, + {NAN, "nan"}, + {INFINITY, "inf"}, + {-INFINITY, "-inf"}, + {2311380, "2311380"}, + {16777216, "16777216"}, /* Maximum integer value of float */ + {-16777216, "-16777216"}, + /* TODO test extreme value here, that push against the limits of f32 */ + }; + int j, len; + char buf[1024]; + + for (j = 0; j < sizeof(cases) / sizeof(*cases); j++) { + len = vcz_ftoa(buf, cases[j].val); + /* printf("j = %d %f->%s=='%s'\n", j, cases[j].val, cases[j].expected, buf); */ + CU_ASSERT_EQUAL_FATAL(len, strlen(cases[j].expected)); + CU_ASSERT_STRING_EQUAL_FATAL(buf, cases[j].expected); + } +} + /*================================================= Test suite management ================================================= @@ -278,6 +328,7 @@ main(int argc, char **argv) { "test_int_field_2d", test_int_field_2d }, { "test_variant_encoder_minimal", test_variant_encoder_minimal }, { "test_itoa_small", test_itoa_small }, + { "test_ftoa", test_ftoa }, { NULL, NULL }, }; return test_main(tests, argc, argv); diff --git a/lib/vcf_encoder.c b/lib/vcf_encoder.c index 97aaf22..923465a 100644 --- a/lib/vcf_encoder.c +++ b/lib/vcf_encoder.c @@ -5,9 +5,10 @@ #include #include #include +#include int -vcz_itoa(int32_t value, char *buf) +vcz_itoa(char * buf, int32_t value) { int p = 0; int j, k; @@ -55,6 +56,52 @@ vcz_itoa(int32_t value, char *buf) return p; } +int +vcz_ftoa(char * buf, float value) +{ + int p = 0; + int64_t i, d1, d2, d3; + + if (isnan(value)) { + strcpy(buf, "nan"); + return p + 3; + } + if (value < 0) { + buf[p] = '-'; + p++; + value = -value; + } + if (isinf(value)) { + strcpy(buf + p, "inf"); + return p + 3; + } + + /* integer part */ + i = (int64_t) round(((double) value) * 1000); + p += vcz_itoa(buf + p, i / 1000); + + /* fractional part */ + d3 = i % 10; + d2 = (i / 10) % 10; + d1 = (i / 100) % 10; + if (d1 + d2 + d3 > 0) { + buf[p] = '.'; + p ++; + buf[p] = d1 + '0'; + p ++; + if (d2 + d3 > 0) { + buf[p] = d2 + '0'; + p ++; + if (d3 > 0) { + buf[p] = d3 + '0'; + p ++; + } + } + } + buf[p] = '\0'; + return p; +} + static bool bool_all_missing(const int8_t *restrict data, size_t n) { @@ -146,8 +193,6 @@ int32_field_write_entry( const int32_t *source = (int32_t *) data; int32_t value; size_t column; - /* int written; */ - /* char value_buffer[128]; */ for (column = 0; column < self->num_columns; column++) { value = source[column]; @@ -160,11 +205,7 @@ int32_field_write_entry( dest[offset] = '.'; offset++; } else { - offset += vcz_itoa(value, dest + offset); - /* written = snprintf(value_buffer, sizeof(value_buffer), "%d", value); - */ - /* memcpy(dest + offset, value_buffer, written); */ - /* offset += written; */ + offset += vcz_itoa(dest + offset, value); } } } @@ -183,8 +224,6 @@ float32_field_write_entry( int32_t int32_value; float value; size_t column; - int written; - char value_buffer[128]; for (column = 0; column < self->num_columns; column++) { int32_value = int32_source[column]; @@ -199,11 +238,8 @@ float32_field_write_entry( dest[offset] = '.'; offset++; } else { - /* offset += vcz_itoa(value, dest + offset); */ value = source[column]; - written = snprintf(value_buffer, sizeof(value_buffer), "%.3g", value); - memcpy(dest + offset, value_buffer, written); - offset += written; + offset += vcz_ftoa(dest + offset, value); } } dest[offset] = '\t'; @@ -320,11 +356,7 @@ vcz_variant_encoder_write_sample_gt(const vcz_variant_encoder_t *self, size_t va dest[offset] = '.'; offset++; } else { - offset += vcz_itoa(value, dest + offset); - /* written = snprintf(value_buffer, sizeof(value_buffer), "%d", value); - */ - /* memcpy(dest + offset, value_buffer, written); */ - /* offset += written; */ + offset += vcz_itoa(dest + offset, value); } } } diff --git a/lib/vcf_encoder.h b/lib/vcf_encoder.h index 79f9a52..a9b4307 100644 --- a/lib/vcf_encoder.h +++ b/lib/vcf_encoder.h @@ -84,4 +84,5 @@ int vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self, int64_t vcz_variant_encoder_write_row( const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen); -int vcz_itoa(int32_t v, char *out); +int vcz_itoa(char *buf, int32_t v); +int vcz_ftoa(char *buf, float v); diff --git a/tests/data/vcf/1kg_2020_chr20_annotations.bcf b/tests/data/vcf/1kg_2020_chr20_annotations.bcf new file mode 100644 index 0000000..457b905 Binary files /dev/null and b/tests/data/vcf/1kg_2020_chr20_annotations.bcf differ diff --git a/tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi b/tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi new file mode 100644 index 0000000..c543abf Binary files /dev/null and b/tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi differ diff --git a/tests/data/vcf/1kg_2020_chrM.vcf.gz b/tests/data/vcf/1kg_2020_chrM.vcf.gz new file mode 100644 index 0000000..c0b6cf3 Binary files /dev/null and b/tests/data/vcf/1kg_2020_chrM.vcf.gz differ diff --git a/tests/data/vcf/1kg_2020_chrM.vcf.gz.csi b/tests/data/vcf/1kg_2020_chrM.vcf.gz.csi new file mode 100644 index 0000000..66b4cea Binary files /dev/null and b/tests/data/vcf/1kg_2020_chrM.vcf.gz.csi differ diff --git a/tests/test_vcf_roundtrip.py b/tests/test_vcf_roundtrip.py index adcd863..1f8a804 100644 --- a/tests/test_vcf_roundtrip.py +++ b/tests/test_vcf_roundtrip.py @@ -1,3 +1,5 @@ +import pathlib + import pytest from bio2zarr import vcf2zarr @@ -5,20 +7,28 @@ from .utils import assert_vcfs_close +def vcz_path_cache(vcf_path): + """ + Store converted files in a cache to speed up tests. We're not testing + vcf2zarr here, so no point in running over and over again. + """ + cache_path = pathlib.Path("vcz_test_cache") + if not cache_path.exists(): + cache_path.mkdir() + cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz") + if not cached_vcz_path.exists(): + vcf2zarr.convert([vcf_path], cached_vcz_path, worker_processes=0) + return cached_vcz_path + + @pytest.mark.parametrize( - "vcf_file", ["sample.vcf.gz"] + "vcf_file", + ["sample.vcf.gz", "1kg_2020_chr20_annotations.bcf", "1kg_2020_chrM.vcf.gz"], ) @pytest.mark.parametrize("implementation", ["c", "numba"]) -def test_vcf_to_zarr_to_vcf__real_files(shared_datadir, tmp_path, vcf_file, implementation): - path = shared_datadir / "vcf" / vcf_file - intermediate_icf = tmp_path.joinpath("intermediate.icf") - intermediate_vcz = tmp_path.joinpath("intermediate.vcz") - output = tmp_path.joinpath("output.vcf") - - vcf2zarr.convert( - [path], intermediate_vcz, icf_path=intermediate_icf, worker_processes=0 - ) - - write_vcf(intermediate_vcz, output, implementation=implementation) - - assert_vcfs_close(path, output) +def test_vcf_to_zarr_to_vcf__real_files(tmp_path, vcf_file, implementation): + original = pathlib.Path("tests/data/vcf") / vcf_file + vcz = vcz_path_cache(original) + generated = tmp_path.joinpath("output.vcf") + write_vcf(vcz, generated, implementation=implementation) + assert_vcfs_close(original, generated) diff --git a/tests/utils.py b/tests/utils.py index b893841..fa4b296 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -90,17 +90,24 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03): else: val1 = v1.format(field) val2 = v2.format(field) - if val1.dtype.kind == "f": - np.testing.assert_allclose( - val1, - val2, - rtol=rtol, - atol=atol, - err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}", - ) + if val2 is None: + # FIXME this is a quick hack to workaround missing support for + # dealing with the field missing vs all-elements-in-field missing + # issue. + # https://github.com/jeromekelleher/vcztools/issues/14 + assert [str(x) == "." for x in val1] else: - np.testing.assert_array_equal( - val1, - val2, - err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}", - ) + if val1.dtype.kind == "f": + np.testing.assert_allclose( + val1, + val2, + rtol=rtol, + atol=atol, + err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}", + ) + else: + np.testing.assert_array_equal( + val1, + val2, + err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}", + ) diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py index 3ca6819..351365f 100644 --- a/vcztools/vcf_writer.py +++ b/vcztools/vcf_writer.py @@ -186,7 +186,8 @@ def write_vcf( def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): chrom = contigs[root.variant_contig.blocks[v_chunk]] - pos = root.variant_position.blocks[v_chunk] + # TODO check we don't truncate silently by doing this + pos = root.variant_position.blocks[v_chunk].astype(np.int32) id = root.variant_id.blocks[v_chunk].astype("S") alleles = root.variant_allele.blocks[v_chunk] ref = alleles[:, 0].astype("S") @@ -238,7 +239,8 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output): filter=filter_, ) # print(encoder.arrays) - encoder.add_gt_field(gt.astype("int32"), gt_phased) + if gt is not None: + encoder.add_gt_field(gt.astype("int32"), gt_phased) for name, array in info_fields.items(): if array.dtype.kind == "O": array = array.astype("S")