Split out model and dataset creation into conftest
1 parent d7c0779 · commit 221c35c
Showing 3 changed files with 302 additions and 270 deletions.
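The new conftest module below defines two pytest fixtures, hugectr_example_dataset and hugectr_example_model, so the tests updated in this commit can request the example dataset and trained model by name instead of building them inline.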
@@ -0,0 +1,143 @@
import os

import hugectr
import numpy as np
import pandas as pd
import pytest

from merlin.io import Dataset
from merlin.schema import Tags
from merlin.transforms import Workflow
from merlin.transforms.ops import AddTags, Categorify


@pytest.fixture
def hugectr_example_dataset(tmpdir):
    num_rows = 64

    df = pd.DataFrame(
        {
            "a": np.arange(num_rows).astype(np.int64),
            "b": np.arange(num_rows).astype(np.int64),
            "c": np.arange(num_rows).astype(np.int64),
            "d": np.random.rand(num_rows).astype(np.float32),
            "label": np.array([0] * num_rows).astype(np.float32),
        },
    )
    categorical_columns = ["a", "b", "c"]
    dense_columns = ["d"]
    target_columns = ["label"]

    workflow = Workflow(
        (categorical_columns >> Categorify())
        + (dense_columns >> AddTags(Tags.CONTINUOUS))
        + (target_columns >> AddTags(Tags.TARGET))
    )

    dataset = workflow.fit_transform(Dataset(df))

    return dataset


@pytest.fixture
def hugectr_example_model(hugectr_example_dataset, tmpdir):
    dataset = hugectr_example_dataset

    train_path = os.path.join(tmpdir, "hugectr_example_data/")
    os.mkdir(train_path)

    # write the transformed dataset (and its _file_list.txt) where the
    # HugeCTR data reader below expects to find it
    dataset.to_parquet(
        output_path=train_path,
        cats=dataset.schema.select_by_tag(Tags.CATEGORICAL).column_names,
        conts=dataset.schema.select_by_tag(Tags.CONTINUOUS).column_names,
        labels=dataset.schema.select_by_tag(Tags.TARGET).column_names,
    )

    # slot_sizes = list of cardinalities (unique value counts) per categorical column
    slot_sizes = [
        col.properties["embedding_sizes"]["cardinality"]
        for col in dataset.schema.select_by_tag(Tags.CATEGORICAL)
    ]

    # dense_dim = number of dense (continuous) inputs
    dense_dim = len(dataset.schema.select_by_tag(Tags.CONTINUOUS))

    solver = hugectr.CreateSolver(
        vvgpu=[[0]],
        batchsize=10,
        batchsize_eval=10,
        max_eval_batches=50,
        i64_input_key=True,
        use_mixed_precision=False,
        repeat_dataset=True,
    )
    # https://github.com/NVIDIA-Merlin/HugeCTR/blob/9e648f879166fc93931c676a5594718f70178a92/docs/source/api/python_interface.md#datareaderparams
    reader = hugectr.DataReaderParams(
        data_reader_type=hugectr.DataReaderType_t.Parquet,
        source=[os.path.join(train_path, "_file_list.txt")],
        eval_source=os.path.join(train_path, "_file_list.txt"),
        check_type=hugectr.Check_t.Non,
    )

    optimizer = hugectr.CreateOptimizer(optimizer_type=hugectr.Optimizer_t.Adam)
    model = hugectr.Model(solver, reader, optimizer)

    model.add(
        hugectr.Input(
            label_dim=1,
            label_name="label",
            dense_dim=dense_dim,
            dense_name="dense",
            data_reader_sparse_param_array=[
                hugectr.DataReaderSparseParam("data1", len(slot_sizes) + 1, True, len(slot_sizes))
            ],
        )
    )
    model.add(
        hugectr.SparseEmbedding(
            embedding_type=hugectr.Embedding_t.DistributedSlotSparseEmbeddingHash,
            workspace_size_per_gpu_in_mb=107,
            embedding_vec_size=16,
            combiner="sum",
            sparse_embedding_name="sparse_embedding1",
            bottom_name="data1",
            slot_size_array=slot_sizes,
            optimizer=optimizer,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["dense"],
            top_names=["fc1"],
            num_output=512,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Reshape,
            bottom_names=["sparse_embedding1"],
            top_names=["reshape1"],
            # flatten the embeddings: 3 categorical slots x embedding_vec_size 16 = 48
            leading_dim=48,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=["reshape1", "fc1"],
            top_names=["fc2"],
            num_output=1,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
            bottom_names=["fc2", "label"],
            top_names=["loss"],
        )
    )
    model.compile()
    model.summary()
    model.fit(max_iter=20, display=100, eval_interval=200, snapshot=10)

    return model
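With these fixtures in place, pytest builds the dataset and trained model on demand and injects them into any test that names them as parameters. A minimal sketch of a consuming test (hypothetical; test_hugectr_example_model is not part of this commit):

def test_hugectr_example_model(hugectr_example_model):
    # The fixture returns the compiled, fitted hugectr.Model defined above;
    # a real test would go on to exercise or export the trained model.
    assert hugectr_example_model is not None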