Skip to content

Commit

Permalink
Merge pull request #13 from jeromekelleher/more-real-examples
Browse files Browse the repository at this point in the history
More real examples
  • Loading branch information
jeromekelleher authored Jul 5, 2024
2 parents 591734f + 5d31345 commit 3382a02
Show file tree
Hide file tree
Showing 11 changed files with 154 additions and 51 deletions.
2 changes: 1 addition & 1 deletion lib/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ cunit_dep = dependency('cunit')

tests = executable('tests',
sources: ['tests.c', 'vcf_encoder.c'],
dependencies: cunit_dep,
dependencies: [cunit_dep, m_dep],
)
53 changes: 52 additions & 1 deletion lib/tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,63 @@ test_itoa_small(void)

for (j = -255; j <= 256; j++) {
len1 = sprintf(dest1, "%d", j);
len2 = vcz_itoa(j, dest2);
len2 = vcz_itoa(dest2, j);
/* printf("%s %s\n", dest1, dest2); */
CU_ASSERT_STRING_EQUAL(dest1, dest2);
CU_ASSERT_EQUAL(len1, len2);
}
}


static void
test_ftoa(void)
{
struct test_case {
float val;
const char *expected;
};
struct test_case cases[] = {
{0.0, "0"},
{0.0001, "0"},
{0.0005, "0.001"},
{0.3, "0.3"},
{0.32, "0.32"},
{0.329, "0.329"},
{0.3217, "0.322"},
{8.0, "8"},
{8.0001, "8"},
{8.3, "8.3"},
{8.32, "8.32"},
{8.329, "8.329"},
{8.3217, "8.322"},
{443.998, "443.998"},
{1028.0, "1028"},
{1028.0001, "1028"},
{1028.3, "1028.3"},
{1028.32, "1028.32"},
{1028.329, "1028.329"},
{1028.3217, "1028.322"},
{1000000, "1000000"},
{-100.0, "-100"},
{NAN, "nan"},
{INFINITY, "inf"},
{-INFINITY, "-inf"},
{2311380, "2311380"},
{16777216, "16777216"}, /* Maximum integer value of float */
{-16777216, "-16777216"},
/* TODO test extreme value here, that push against the limits of f32 */
};
int j, len;
char buf[1024];

for (j = 0; j < sizeof(cases) / sizeof(*cases); j++) {
len = vcz_ftoa(buf, cases[j].val);
/* printf("j = %d %f->%s=='%s'\n", j, cases[j].val, cases[j].expected, buf); */
CU_ASSERT_EQUAL_FATAL(len, strlen(cases[j].expected));
CU_ASSERT_STRING_EQUAL_FATAL(buf, cases[j].expected);
}
}

/*=================================================
Test suite management
=================================================
Expand Down Expand Up @@ -278,6 +328,7 @@ main(int argc, char **argv)
{ "test_int_field_2d", test_int_field_2d },
{ "test_variant_encoder_minimal", test_variant_encoder_minimal },
{ "test_itoa_small", test_itoa_small },
{ "test_ftoa", test_ftoa },
{ NULL, NULL },
};
return test_main(tests, argc, argv);
Expand Down
70 changes: 51 additions & 19 deletions lib/vcf_encoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>
#include <math.h>

int
vcz_itoa(int32_t value, char *buf)
vcz_itoa(char * buf, int32_t value)
{
int p = 0;
int j, k;
Expand Down Expand Up @@ -55,6 +56,52 @@ vcz_itoa(int32_t value, char *buf)
return p;
}

int
vcz_ftoa(char * buf, float value)
{
int p = 0;
int64_t i, d1, d2, d3;

if (isnan(value)) {
strcpy(buf, "nan");
return p + 3;
}
if (value < 0) {
buf[p] = '-';
p++;
value = -value;
}
if (isinf(value)) {
strcpy(buf + p, "inf");
return p + 3;
}

/* integer part */
i = (int64_t) round(((double) value) * 1000);
p += vcz_itoa(buf + p, i / 1000);

/* fractional part */
d3 = i % 10;
d2 = (i / 10) % 10;
d1 = (i / 100) % 10;
if (d1 + d2 + d3 > 0) {
buf[p] = '.';
p ++;
buf[p] = d1 + '0';
p ++;
if (d2 + d3 > 0) {
buf[p] = d2 + '0';
p ++;
if (d3 > 0) {
buf[p] = d3 + '0';
p ++;
}
}
}
buf[p] = '\0';
return p;
}

static bool
bool_all_missing(const int8_t *restrict data, size_t n)
{
Expand Down Expand Up @@ -146,8 +193,6 @@ int32_field_write_entry(
const int32_t *source = (int32_t *) data;
int32_t value;
size_t column;
/* int written; */
/* char value_buffer[128]; */

for (column = 0; column < self->num_columns; column++) {
value = source[column];
Expand All @@ -160,11 +205,7 @@ int32_field_write_entry(
dest[offset] = '.';
offset++;
} else {
offset += vcz_itoa(value, dest + offset);
/* written = snprintf(value_buffer, sizeof(value_buffer), "%d", value);
*/
/* memcpy(dest + offset, value_buffer, written); */
/* offset += written; */
offset += vcz_itoa(dest + offset, value);
}
}
}
Expand All @@ -183,8 +224,6 @@ float32_field_write_entry(
int32_t int32_value;
float value;
size_t column;
int written;
char value_buffer[128];

for (column = 0; column < self->num_columns; column++) {
int32_value = int32_source[column];
Expand All @@ -199,11 +238,8 @@ float32_field_write_entry(
dest[offset] = '.';
offset++;
} else {
/* offset += vcz_itoa(value, dest + offset); */
value = source[column];
written = snprintf(value_buffer, sizeof(value_buffer), "%.3g", value);
memcpy(dest + offset, value_buffer, written);
offset += written;
offset += vcz_ftoa(dest + offset, value);
}
}
dest[offset] = '\t';
Expand Down Expand Up @@ -320,11 +356,7 @@ vcz_variant_encoder_write_sample_gt(const vcz_variant_encoder_t *self, size_t va
dest[offset] = '.';
offset++;
} else {
offset += vcz_itoa(value, dest + offset);
/* written = snprintf(value_buffer, sizeof(value_buffer), "%d", value);
*/
/* memcpy(dest + offset, value_buffer, written); */
/* offset += written; */
offset += vcz_itoa(dest + offset, value);
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion lib/vcf_encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,5 @@ int vcz_variant_encoder_add_info_field(vcz_variant_encoder_t *self,
int64_t vcz_variant_encoder_write_row(
const vcz_variant_encoder_t *self, size_t row, char *buf, size_t buflen);

int vcz_itoa(int32_t v, char *out);
int vcz_itoa(char *buf, int32_t v);
int vcz_ftoa(char *buf, float v);
Binary file added tests/data/vcf/1kg_2020_chr20_annotations.bcf
Binary file not shown.
Binary file added tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi
Binary file not shown.
Binary file added tests/data/vcf/1kg_2020_chrM.vcf.gz
Binary file not shown.
Binary file added tests/data/vcf/1kg_2020_chrM.vcf.gz.csi
Binary file not shown.
38 changes: 24 additions & 14 deletions tests/test_vcf_roundtrip.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,34 @@
import pathlib

import pytest

from bio2zarr import vcf2zarr
from vcztools.vcf_writer import write_vcf
from .utils import assert_vcfs_close


def vcz_path_cache(vcf_path):
"""
Store converted files in a cache to speed up tests. We're not testing
vcf2zarr here, so no point in running over and over again.
"""
cache_path = pathlib.Path("vcz_test_cache")
if not cache_path.exists():
cache_path.mkdir()
cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz")
if not cached_vcz_path.exists():
vcf2zarr.convert([vcf_path], cached_vcz_path, worker_processes=0)
return cached_vcz_path


@pytest.mark.parametrize(
"vcf_file", ["sample.vcf.gz"]
"vcf_file",
["sample.vcf.gz", "1kg_2020_chr20_annotations.bcf", "1kg_2020_chrM.vcf.gz"],
)
@pytest.mark.parametrize("implementation", ["c", "numba"])
def test_vcf_to_zarr_to_vcf__real_files(shared_datadir, tmp_path, vcf_file, implementation):
path = shared_datadir / "vcf" / vcf_file
intermediate_icf = tmp_path.joinpath("intermediate.icf")
intermediate_vcz = tmp_path.joinpath("intermediate.vcz")
output = tmp_path.joinpath("output.vcf")

vcf2zarr.convert(
[path], intermediate_vcz, icf_path=intermediate_icf, worker_processes=0
)

write_vcf(intermediate_vcz, output, implementation=implementation)

assert_vcfs_close(path, output)
def test_vcf_to_zarr_to_vcf__real_files(tmp_path, vcf_file, implementation):
original = pathlib.Path("tests/data/vcf") / vcf_file
vcz = vcz_path_cache(original)
generated = tmp_path.joinpath("output.vcf")
write_vcf(vcz, generated, implementation=implementation)
assert_vcfs_close(original, generated)
33 changes: 20 additions & 13 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,17 +90,24 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03):
else:
val1 = v1.format(field)
val2 = v2.format(field)
if val1.dtype.kind == "f":
np.testing.assert_allclose(
val1,
val2,
rtol=rtol,
atol=atol,
err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}",
)
if val2 is None:
# FIXME this is a quick hack to workaround missing support for
# dealing with the field missing vs all-elements-in-field missing
# issue.
# https://github.com/jeromekelleher/vcztools/issues/14
assert [str(x) == "." for x in val1]
else:
np.testing.assert_array_equal(
val1,
val2,
err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}",
)
if val1.dtype.kind == "f":
np.testing.assert_allclose(
val1,
val2,
rtol=rtol,
atol=atol,
err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}",
)
else:
np.testing.assert_array_equal(
val1,
val2,
err_msg=f"FORMAT {field} not equal for variants\n{v1}{v2}",
)
6 changes: 4 additions & 2 deletions vcztools/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ def write_vcf(

def c_chunk_to_vcf(root, v_chunk, contigs, filters, output):
chrom = contigs[root.variant_contig.blocks[v_chunk]]
pos = root.variant_position.blocks[v_chunk]
# TODO check we don't truncate silently by doing this
pos = root.variant_position.blocks[v_chunk].astype(np.int32)
id = root.variant_id.blocks[v_chunk].astype("S")
alleles = root.variant_allele.blocks[v_chunk]
ref = alleles[:, 0].astype("S")
Expand Down Expand Up @@ -238,7 +239,8 @@ def c_chunk_to_vcf(root, v_chunk, contigs, filters, output):
filter=filter_,
)
# print(encoder.arrays)
encoder.add_gt_field(gt.astype("int32"), gt_phased)
if gt is not None:
encoder.add_gt_field(gt.astype("int32"), gt_phased)
for name, array in info_fields.items():
if array.dtype.kind == "O":
array = array.astype("S")
Expand Down

0 comments on commit 3382a02

Please sign in to comment.