Split out model and dataset creation into conftest
oliverholworthy committed Aug 23, 2022
1 parent d7c0779 commit 221c35c
Showing 3 changed files with 302 additions and 270 deletions.
143 changes: 143 additions & 0 deletions tests/unit/systems/ops/hugectr/conftest.py
@@ -0,0 +1,143 @@
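# Shared fixtures for the HugeCTR op tests: a small synthetic dataset
# and a minimal HugeCTR model trained on it.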
import os

import hugectr
import numpy as np
import pandas as pd
import pytest

from merlin.io import Dataset
from merlin.schema import Tags
from nvtabular import Workflow
from nvtabular.ops import AddTags, Categorify


@pytest.fixture
def hugectr_example_dataset():
num_rows = 64

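    # A small synthetic frame: three integer columns to treat as categorical,
    # one float column as continuous, and a constant binary label.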
df = pd.DataFrame(
{
"a": np.arange(num_rows).astype(np.int64),
"b": np.arange(num_rows).astype(np.int64),
"c": np.arange(num_rows).astype(np.int64),
"d": np.random.rand(num_rows).astype(np.float32),
"label": np.array([0] * num_rows).astype(np.float32),
},
)
categorical_columns = ["a", "b", "c"]
dense_columns = ["d"]
target_columns = ["label"]

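    # Categorify encodes the categorical columns as contiguous integer ids and
    # records their cardinalities in the schema; AddTags marks the continuous
    # and target columns so they can be selected by tag later.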
workflow = Workflow(
(categorical_columns >> Categorify())
+ (dense_columns >> AddTags(Tags.CONTINUOUS))
+ (target_columns >> AddTags(Tags.TARGET))
)

dataset = workflow.fit_transform(Dataset(df))

return dataset


@pytest.fixture
def hugectr_example_model(hugectr_example_dataset, tmpdir):
dataset = hugectr_example_dataset

train_path = os.path.join(tmpdir, "hugectr_example_data/")
os.mkdir(train_path)

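    # to_parquet writes the parquet files plus the _file_list.txt index that
    # the HugeCTR Parquet data reader consumes below.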
dataset.to_parquet(
        output_path=train_path,
cats=dataset.schema.select_by_tag(Tags.CATEGORICAL).column_names,
conts=dataset.schema.select_by_tag(Tags.CONTINUOUS).column_names,
labels=dataset.schema.select_by_tag(Tags.TARGET).column_names,
)

    # slot_sizes = list of cardinalities (vocabulary sizes), one per categorical column
slot_sizes = [
col.properties["embedding_sizes"]["cardinality"]
for col in dataset.schema.select_by_tag(Tags.CATEGORICAL)
]

    # dense_dim = number of continuous (dense) input columns
dense_dim = len(dataset.schema.select_by_tag(Tags.CONTINUOUS))

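    # Single-GPU solver; i64_input_key matches the int64 categorical columns.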
solver = hugectr.CreateSolver(
vvgpu=[[0]],
batchsize=10,
batchsize_eval=10,
max_eval_batches=50,
i64_input_key=True,
use_mixed_precision=False,
repeat_dataset=True,
)
# https://github.com/NVIDIA-Merlin/HugeCTR/blob/9e648f879166fc93931c676a5594718f70178a92/docs/source/api/python_interface.md#datareaderparams
reader = hugectr.DataReaderParams(
data_reader_type=hugectr.DataReaderType_t.Parquet,
source=[os.path.join(train_path, "_file_list.txt")],
eval_source=os.path.join(train_path, "_file_list.txt"),
check_type=hugectr.Check_t.Non,
)

optimizer = hugectr.CreateOptimizer(optimizer_type=hugectr.Optimizer_t.Adam)
model = hugectr.Model(solver, reader, optimizer)

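    # One sparse input group ("data1") carries all categorical features:
    # one slot per categorical column, with a fixed nnz per slot.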
model.add(
hugectr.Input(
label_dim=1,
label_name="label",
dense_dim=dense_dim,
dense_name="dense",
data_reader_sparse_param_array=[
hugectr.DataReaderSparseParam("data1", len(slot_sizes) + 1, True, len(slot_sizes))
],
)
)
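    # Hash-table embedding over the sparse input: 16-dim vectors combined by
    # sum within each slot; slot_size_array uses the cardinalities from above.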
model.add(
hugectr.SparseEmbedding(
embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
workspace_size_per_gpu_in_mb=107,
embedding_vec_size=16,
combiner="sum",
sparse_embedding_name="sparse_embedding1",
bottom_name="data1",
slot_size_array=slot_sizes,
optimizer=optimizer,
)
)
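    # Dense tower over the continuous input.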
model.add(
hugectr.DenseLayer(
layer_type=hugectr.Layer_t.InnerProduct,
bottom_names=["dense"],
top_names=["fc1"],
num_output=512,
)
)
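    # Flatten the embedding output: 3 slots x 16-dim vectors -> leading_dim=48.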
model.add(
hugectr.DenseLayer(
layer_type=hugectr.Layer_t.Reshape,
bottom_names=["sparse_embedding1"],
top_names=["reshape1"],
leading_dim=48,
)
)
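    # Final projection from the flattened embeddings and the dense tower
    # output down to a single logit.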
model.add(
hugectr.DenseLayer(
layer_type=hugectr.Layer_t.InnerProduct,
bottom_names=["reshape1", "fc1"],
top_names=["fc2"],
num_output=1,
)
)
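    # Binary cross-entropy loss against the "label" input.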
model.add(
hugectr.DenseLayer(
layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
bottom_names=["fc2", "label"],
top_names=["loss"],
)
)
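    # Train for a few iterations so the fixture returns a fitted model;
    # snapshot=10 dumps model weights during training.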
model.compile()
model.summary()
model.fit(max_iter=20, display=100, eval_interval=200, snapshot=10)

return model