Skip to content

Commit

Permalink
feat: Add C Data integration test shared library (#337)
Browse files Browse the repository at this point in the history
This PR adds the shared library target required by the archery
integration tester, based on
https://github.com/apache/arrow/blob/main/cpp/src/arrow/integration/c_data_integration_internal.cc
.

I haven't tested this via archery because I have no idea how to do so
(the implementation names and file locations seem hard-coded?); however,
it does add a googletest file with some minimal examples to at least
make sure everything is wired up.

---------

Co-authored-by: Antoine Pitrou <[email protected]>
  • Loading branch information
paleolimbot and pitrou authored Dec 19, 2023
1 parent dfe700f commit 12be163
Show file tree
Hide file tree
Showing 7 changed files with 551 additions and 18 deletions.
36 changes: 24 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ set(NANOARROW_VERSION_MINOR "${nanoarrow_VERSION_MINOR}")
set(NANOARROW_VERSION_PATCH "${nanoarrow_VERSION_PATCH}")

option(NANOARROW_BUILD_TESTS "Build tests" OFF)
option(NANOARROW_BUILD_INTEGRATION_TESTS
"Build cross-implementation Arrow integration tests" OFF)
option(NANOARROW_BUNDLE "Create bundled nanoarrow.h and nanoarrow.c" OFF)
option(NANOARROW_BUNDLE_AS_CPP "Bundle nanoarrow source file as nanoarrow.cc" OFF)
option(NANOARROW_NAMESPACE "A prefix for exported symbols" OFF)
Expand Down Expand Up @@ -151,10 +153,25 @@ else()
DESTINATION include/nanoarrow)
endif()

if(NANOARROW_BUILD_TESTS)
# For testing we use GTest + Arrow C++
# Always build integration test if building tests
# The C Data integration shared library is built when either the unit tests or
# the cross-implementation integration tests are requested.
if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS)
include(FetchContent)

# JSON library dependency, fetched from a pinned release archive and verified
# against its SHA256 hash.
fetchcontent_declare(nlohmann_json
URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.zip
URL_HASH SHA256=95651d7d1fcf2e5c3163c3d37df6d6b3e9e5027299e6bd050d157322ceda9ac9
)
fetchcontent_makeavailable(nlohmann_json)

# Shared library target built from the C Data integration source file.
add_library(nanoarrow_c_data_integration SHARED
src/nanoarrow/integration/c_data_integration.cc)
# Headers resolve from src/ in the build tree and from include/ once installed.
target_include_directories(nanoarrow_c_data_integration
PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
$<INSTALL_INTERFACE:include>)
# nanoarrow and the JSON library are linked PRIVATE, so they are not propagated
# to targets that link against this library.
target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json)
endif()

if(NANOARROW_BUILD_TESTS)
set(MEMORYCHECK_COMMAND_OPTIONS
"--leak-check=full --suppressions=${CMAKE_CURRENT_LIST_DIR}/valgrind.supp --error-exitcode=1"
)
Expand Down Expand Up @@ -203,23 +220,15 @@ if(NANOARROW_BUILD_TESTS)

fetchcontent_makeavailable(googletest)

# JSON library for integration testing
# Also used by some versions of Arrow, so check if this is already available
if(NOT TARGET nlohmann_json::nlohmann_json)
fetchcontent_declare(nlohmann_json
URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.zip
URL_HASH SHA256=95651d7d1fcf2e5c3163c3d37df6d6b3e9e5027299e6bd050d157322ceda9ac9
)
fetchcontent_makeavailable(nlohmann_json)
endif()

add_executable(utils_test src/nanoarrow/utils_test.cc)
add_executable(buffer_test src/nanoarrow/buffer_test.cc)
add_executable(array_test src/nanoarrow/array_test.cc)
add_executable(schema_test src/nanoarrow/schema_test.cc)
add_executable(array_stream_test src/nanoarrow/array_stream_test.cc)
add_executable(nanoarrow_hpp_test src/nanoarrow/nanoarrow_hpp_test.cc)
add_executable(nanoarrow_testing_test src/nanoarrow/nanoarrow_testing_test.cc)
add_executable(c_data_integration_test
src/nanoarrow/integration/c_data_integration_test.cc)

if(NANOARROW_CODE_COVERAGE)
target_compile_options(coverage_config INTERFACE -O0 -g --coverage)
Expand Down Expand Up @@ -250,6 +259,8 @@ if(NANOARROW_BUILD_TESTS)
gtest_main
nlohmann_json::nlohmann_json
coverage_config)
target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration
gtest_main)

include(GoogleTest)
# Some users have reported a timeout with the default value of 5
Expand All @@ -262,4 +273,5 @@ if(NANOARROW_BUILD_TESTS)
gtest_discover_tests(array_stream_test DISCOVERY_TIMEOUT 10)
gtest_discover_tests(nanoarrow_hpp_test DISCOVERY_TIMEOUT 10)
gtest_discover_tests(nanoarrow_testing_test DISCOVERY_TIMEOUT 10)
gtest_discover_tests(c_data_integration_test DISCOVERY_TIMEOUT 10)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ class TestFile {

// Use testing util to populate the array stream
nanoarrow::testing::TestingJSONReader reader;
NANOARROW_RETURN_NOT_OK(reader.ReadDataFile(json_string, out, error));
NANOARROW_RETURN_NOT_OK(reader.ReadDataFile(
json_string, out, nanoarrow::testing::TestingJSONReader::kNumBatchReadAll,
error));
return NANOARROW_OK;
}

Expand Down
224 changes: 224 additions & 0 deletions src/nanoarrow/integration/c_data_integration.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cerrno>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

#include <nanoarrow/nanoarrow_testing.hpp>
#include "c_data_integration.h"

// Running count of bytes currently held through the integration test allocator.
// Updated by IntegrationTestReallocate() and IntegrationTestFree() and reported
// to callers by nanoarrow_BytesAllocated().
static int64_t kBytesAllocated = 0;

// Reallocation callback for the integration test allocator.
//
// The request is forwarded to nanoarrow's default allocator. The byte counter
// kBytesAllocated always has old_size subtracted from it; new_size is added back
// only when the default allocator returns a non-null pointer.
//
// allocator: unused; the default allocator is always used for the actual work.
// ptr/old_size: the existing allocation and its size in bytes.
// new_size: the requested size in bytes.
// Returns whatever pointer the default allocator's reallocate() returned.
static uint8_t* IntegrationTestReallocate(ArrowBufferAllocator* allocator, uint8_t* ptr,
                                          int64_t old_size, int64_t new_size) {
  ArrowBufferAllocator fallback = ArrowBufferAllocatorDefault();
  uint8_t* resized = fallback.reallocate(&fallback, ptr, old_size, new_size);

  kBytesAllocated -= old_size;
  if (resized != nullptr) {
    kBytesAllocated += new_size;
  }

  return resized;
}

static void IntegrationTestFree(struct ArrowBufferAllocator* allocator, uint8_t* ptr,
int64_t size) {
ArrowBufferAllocator default_allocator = ArrowBufferAllocatorDefault();
kBytesAllocated -= size;
default_allocator.free(&default_allocator, ptr, size);
}

// Builds an ArrowBufferAllocator whose callbacks are the byte-counting
// IntegrationTestReallocate() and IntegrationTestFree() functions. No private
// data is attached.
static ArrowBufferAllocator IntegrationTestAllocator() {
  ArrowBufferAllocator counting_allocator;
  counting_allocator.private_data = nullptr;
  counting_allocator.free = &IntegrationTestFree;
  counting_allocator.reallocate = &IntegrationTestReallocate;
  return counting_allocator;
}

// Copies the entire contents of the file at file_path into out.
//
// The file is opened in binary mode and streamed through a fixed-size stack
// buffer.
//
// out: destination stream that receives the raw file bytes.
// file_path: path of the file to read.
// Returns NANOARROW_OK on success, ENOENT if the file could not be opened, or
// EIO if the input stream reported an unrecoverable error while reading.
static ArrowErrorCode ReadFileString(std::ostream& out, const std::string& file_path) {
  std::ifstream infile(file_path, std::ios::in | std::ios::binary);
  if (!infile.is_open()) {
    // Previously an unopenable file was silently treated as an empty file, which
    // only surfaced later as a confusing JSON parse failure.
    return ENOENT;
  }

  char buf[8096];
  while (true) {
    infile.read(buf, sizeof(buf));
    const std::streamsize n_read = infile.gcount();
    if (n_read <= 0) {
      break;
    }

    out.write(buf, n_read);
  }

  // Reaching end-of-file sets eofbit/failbit but not badbit; badbit means the
  // read itself failed.
  if (infile.bad()) {
    return EIO;
  }

  return NANOARROW_OK;
}

// Reads the JSON file at json_path and uses TestingJSONReader::ReadDataFile() to
// populate out.
//
// The reader is constructed with the byte-counting integration test allocator.
//
// json_path: path of the integration JSON file.
// out: array stream populated by the reader.
// num_batch: forwarded unchanged to ReadDataFile().
// error: receives a message when reading the file or parsing it fails.
// Returns NANOARROW_OK on success or the first failing error code.
static ArrowErrorCode ArrayStreamFromJsonFilePath(const std::string& json_path,
                                                  ArrowArrayStream* out, int num_batch,
                                                  ArrowError* error) {
  std::stringstream contents;
  NANOARROW_RETURN_NOT_OK_WITH_ERROR(ReadFileString(contents, json_path), error);

  const std::string json_text = contents.str();
  nanoarrow::testing::TestingJSONReader json_reader(IntegrationTestAllocator());
  NANOARROW_RETURN_NOT_OK(json_reader.ReadDataFile(json_text, out, num_batch, error));

  return NANOARROW_OK;
}

// In-memory result of draining an array stream: the stream's schema plus every
// array it produced, in the order they were read (see MaterializeJsonFilePath()).
struct MaterializedArrayStream {
// Schema obtained from the stream's get_schema() callback.
nanoarrow::UniqueSchema schema;
// Arrays obtained from successive get_next() calls.
std::vector<nanoarrow::UniqueArray> arrays;
};

// Reads the JSON file at json_path into an array stream and drains that stream
// into out.
//
// json_path: path of the integration JSON file.
// out: receives the stream's schema and every array the stream yields.
// num_batch: forwarded unchanged to ArrayStreamFromJsonFilePath().
// error: receives a message on failure; for stream callback failures the text
//   comes from the stream's get_last_error() when it is non-null.
// Returns NANOARROW_OK on success, or the error code from the first step that
// fails: building the stream, get_schema(), or get_next().
static ArrowErrorCode MaterializeJsonFilePath(const std::string& json_path,
                                              MaterializedArrayStream* out, int num_batch,
                                              ArrowError* error) {
  nanoarrow::UniqueArrayStream stream;
  NANOARROW_RETURN_NOT_OK(
      ArrayStreamFromJsonFilePath(json_path, stream.get(), num_batch, error));

  // Copies the stream's last error message (if any) into error and hands back the
  // failing code so callers can write `return fail(code);`.
  auto fail = [&stream, error](int code) -> ArrowErrorCode {
    const char* err = stream->get_last_error(stream.get());
    if (err != nullptr) {
      ArrowErrorSet(error, "%s", err);
    }
    return code;
  };

  int result = stream->get_schema(stream.get(), out->schema.get());
  if (result != NANOARROW_OK) {
    // The original code recorded the message but kept going, so a schema failure
    // could still end in NANOARROW_OK with an unusable schema.
    return fail(result);
  }

  nanoarrow::UniqueArray tmp;
  while (true) {
    tmp.reset();
    result = stream->get_next(stream.get(), tmp.get());
    if (result != NANOARROW_OK) {
      return fail(result);
    }

    // A released (null release callback) array marks the end of the stream.
    if (tmp->release == nullptr) {
      break;
    }

    out->arrays.emplace_back(tmp.get());
  }

  return NANOARROW_OK;
}

// Materializes only the schema from the JSON file at json_path and moves it
// into out.
//
// json_path: path of the integration JSON file.
// out: destination schema; populated via ArrowSchemaMove() on success.
// error: receives a message on failure.
// Returns NANOARROW_OK on success or the error code from materialization.
static ArrowErrorCode ExportSchemaFromJson(const char* json_path, ArrowSchema* out,
                                           ArrowError* error) {
  const int schema_only = nanoarrow::testing::TestingJSONReader::kNumBatchOnlySchema;

  MaterializedArrayStream materialized;
  NANOARROW_RETURN_NOT_OK(
      MaterializeJsonFilePath(json_path, &materialized, schema_only, error));

  ArrowSchemaMove(materialized.schema.get(), out);
  return NANOARROW_OK;
}

// Compares an imported schema against the schema described by the JSON file at
// json_path.
//
// json_path: path of the integration JSON file holding the expected schema.
// schema: the schema to check; it is wrapped in a nanoarrow::UniqueSchema first
//   (presumably taking ownership so it is released on every return path --
//   confirm against nanoarrow.hpp).
// error: receives either a failure message or the list of differences.
// Returns NANOARROW_OK when no differences are found, EINVAL when the
// comparison reports differences, or the error code of a failing step.
static ArrowErrorCode ImportSchemaAndCompareToJson(const char* json_path,
                                                   ArrowSchema* schema,
                                                   ArrowError* error) {
  nanoarrow::UniqueSchema imported(schema);

  const int schema_only = nanoarrow::testing::TestingJSONReader::kNumBatchOnlySchema;
  MaterializedArrayStream expected;
  NANOARROW_RETURN_NOT_OK(
      MaterializeJsonFilePath(json_path, &expected, schema_only, error));

  nanoarrow::testing::TestingJSONComparison comparison;
  NANOARROW_RETURN_NOT_OK(
      comparison.CompareSchema(imported.get(), expected.schema.get(), error));

  if (comparison.num_differences() > 0) {
    std::stringstream report;
    comparison.WriteDifferences(report);
    const std::string report_text = report.str();
    const int n_differences = static_cast<int>(comparison.num_differences());
    ArrowErrorSet(error, "Found %d differences:\n%s", n_differences,
                  report_text.c_str());
    return EINVAL;
  }

  return NANOARROW_OK;
}

// Materializes the JSON file at json_path and moves batch number num_batch
// into out.
//
// json_path: path of the integration JSON file.
// num_batch: index of the batch to export; also forwarded to the JSON reader.
// out: destination array; populated via ArrowArrayMove() on success.
// error: receives a message on failure.
// Returns NANOARROW_OK on success, EINVAL if num_batch does not index one of the
// materialized arrays, or the error code from materialization.
static ArrowErrorCode ExportBatchFromJson(const char* json_path, int num_batch,
                                          ArrowArray* out, ArrowError* error) {
  MaterializedArrayStream data;
  NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(json_path, &data, num_batch, error));

  // Guard the vector access: indexing with a negative or too-large num_batch is
  // undefined behaviour.
  // NOTE(review): this assumes the reader yields every batch up to num_batch. If
  // it yields only the requested batch, the index here should be 0 -- confirm
  // against TestingJSONReader::ReadDataFile().
  if (num_batch < 0 || static_cast<size_t>(num_batch) >= data.arrays.size()) {
    ArrowErrorSet(error, "num_batch %d is out of range: %d batch(es) were materialized",
                  num_batch, static_cast<int>(data.arrays.size()));
    return EINVAL;
  }

  ArrowArrayMove(data.arrays[num_batch].get(), out);
  return NANOARROW_OK;
}

// Compares an imported batch against batch number num_batch of the JSON file at
// json_path.
//
// json_path: path of the integration JSON file holding the expected data.
// num_batch: index of the expected batch; also forwarded to the JSON reader.
// batch: the array to check; it is wrapped in a nanoarrow::UniqueArray first
//   (presumably taking ownership so it is released on every return path --
//   confirm against nanoarrow.hpp).
// error: receives either a failure message or the list of differences.
// Returns NANOARROW_OK when no differences are found, EINVAL when num_batch does
// not index a materialized array or the comparison reports differences, or the
// error code of a failing step.
static ArrowErrorCode ImportBatchAndCompareToJson(const char* json_path, int num_batch,
                                                  ArrowArray* batch, ArrowError* error) {
  nanoarrow::UniqueArray actual(batch);

  MaterializedArrayStream data;
  NANOARROW_RETURN_NOT_OK(MaterializeJsonFilePath(json_path, &data, num_batch, error));

  // Guard the vector access: indexing with a negative or too-large num_batch is
  // undefined behaviour.
  // NOTE(review): this assumes the reader yields every batch up to num_batch. If
  // it yields only the requested batch, the index here should be 0 -- confirm
  // against TestingJSONReader::ReadDataFile().
  if (num_batch < 0 || static_cast<size_t>(num_batch) >= data.arrays.size()) {
    ArrowErrorSet(error, "num_batch %d is out of range: %d batch(es) were materialized",
                  num_batch, static_cast<int>(data.arrays.size()));
    return EINVAL;
  }

  nanoarrow::testing::TestingJSONComparison comparison;
  NANOARROW_RETURN_NOT_OK(comparison.SetSchema(data.schema.get(), error));
  NANOARROW_RETURN_NOT_OK(
      comparison.CompareBatch(actual.get(), data.arrays[num_batch].get(), error));
  if (comparison.num_differences() > 0) {
    std::stringstream ss;
    comparison.WriteDifferences(ss);
    ArrowErrorSet(error, "Found %d differences:\n%s",
                  static_cast<int>(comparison.num_differences()), ss.str().c_str());
    return EINVAL;
  }

  return NANOARROW_OK;
}

// Single error object shared by every exported nanoarrow_CDataIntegration_*
// function. Each of them re-initializes it on entry, and ConvertError() returns a
// pointer to its message field, so a returned message should be treated as valid
// only until the next exported call.
static ArrowError global_error;

// Maps an error code to the return convention of the exported functions:
// nullptr for NANOARROW_OK, otherwise the message stored in global_error.
static const char* ConvertError(ArrowErrorCode errno_code) {
  if (errno_code != NANOARROW_OK) {
    return global_error.message;
  }

  return nullptr;
}

// Reports the current value of the integration test allocator's byte counter.
int64_t nanoarrow_BytesAllocated() {
  const int64_t bytes_outstanding = kBytesAllocated;
  return bytes_outstanding;
}

const char* nanoarrow_CDataIntegration_ExportSchemaFromJson(const char* json_path,
ArrowSchema* out) {
ArrowErrorInit(&global_error);
return ConvertError(ExportSchemaFromJson(json_path, out, &global_error));
}

const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson(const char* json_path,
ArrowSchema* schema) {
ArrowErrorInit(&global_error);
return ConvertError(ImportSchemaAndCompareToJson(json_path, schema, &global_error));
}

const char* nanoarrow_CDataIntegration_ExportBatchFromJson(const char* json_path,
int num_batch,
ArrowArray* out) {
ArrowErrorInit(&global_error);
return ConvertError(ExportBatchFromJson(json_path, num_batch, out, &global_error));
}

const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson(const char* json_path,
int num_batch,
ArrowArray* batch) {
ArrowErrorInit(&global_error);
return ConvertError(
ImportBatchAndCompareToJson(json_path, num_batch, batch, &global_error));
}
Loading

0 comments on commit 12be163

Please sign in to comment.