diff --git a/deepctr/__init__.py b/deepctr/__init__.py index c5e03447..f4a99163 100644 --- a/deepctr/__init__.py +++ b/deepctr/__init__.py @@ -3,5 +3,5 @@ from .import sequence from . import models from .utils import check_version -__version__ = '0.2.1' +__version__ = '0.2.2' check_version(__version__) diff --git a/deepctr/input_embedding.py b/deepctr/input_embedding.py new file mode 100644 index 00000000..7c7977db --- /dev/null +++ b/deepctr/input_embedding.py @@ -0,0 +1,164 @@ +from itertools import chain + +from tensorflow.python.keras import Input +from tensorflow.python.keras.initializers import RandomNormal +from tensorflow.python.keras.layers import Embedding, Dense, Reshape, Concatenate +from tensorflow.python.keras.regularizers import l2 +from .sequence import SequencePoolingLayer +from .utils import get_linear_logit + + +def create_input_dict(feature_dim_dict, prefix=''): + sparse_input = {feat: Input(shape=(1,), name=prefix+'sparse_' + str(i) + '-' + feat) for i, feat in + enumerate(feature_dim_dict["sparse"])} + dense_input = {feat: Input(shape=(1,), name=prefix+'dense_' + str(i) + '-' + feat) for i, feat in + enumerate(feature_dim_dict["dense"])} + return sparse_input, dense_input + + +def create_sequence_input_dict(feature_dim_dict): + + sequence_dim_dict = feature_dim_dict.get('sequence', []) + sequence_input_dict = {feat.name: Input(shape=(feat.maxlen,), name='seq_' + str( + i) + '-' + feat.name) for i, feat in enumerate(sequence_dim_dict)} + sequence_pooling_dict = {feat.name: feat.combiner + for i, feat in enumerate(sequence_dim_dict)} + sequence_len_dict = {feat.name: Input(shape=( + 1,), name='seq_length'+str(i)+'-'+feat.name) for i, feat in enumerate(sequence_dim_dict)} + sequence_max_len_dict = {feat.name: feat.maxlen + for i, feat in enumerate(sequence_dim_dict)} + return sequence_input_dict, sequence_pooling_dict, sequence_len_dict, sequence_max_len_dict + + +def create_embedding_dict(feature_dim_dict, embedding_size, init_std, seed, l2_reg, prefix='sparse'): + if embedding_size == 'auto': + + sparse_embedding = {feat: Embedding(feature_dim_dict["sparse"][feat], 6 * int(pow(feature_dim_dict["sparse"][feat], 0.25)), + embeddings_initializer=RandomNormal( + mean=0.0, stddev=init_std, seed=seed), + embeddings_regularizer=l2(l2_reg), + name=prefix+'_emb_' + str(i) + '-' + feat) for i, feat in + enumerate(feature_dim_dict["sparse"])} + else: + + sparse_embedding = {feat: Embedding(feature_dim_dict["sparse"][feat], embedding_size, + embeddings_initializer=RandomNormal( + mean=0.0, stddev=init_std, seed=seed), + embeddings_regularizer=l2(l2_reg), + name=prefix+'_emb_' + str(i) + '-' + feat) for i, feat in + enumerate(feature_dim_dict["sparse"])} + + if 'sequence' in feature_dim_dict: + count = len(sparse_embedding) + sequence_dim_list = feature_dim_dict['sequence'] + for feat in sequence_dim_list: + if feat.name not in sparse_embedding: + if embedding_size == "auto": + sparse_embedding[feat.name] = Embedding(feat.dimension, 6 * int(pow(feat.dimension, 0.25)), + embeddings_initializer=RandomNormal( + mean=0.0, stddev=init_std, seed=seed), + embeddings_regularizer=l2( + l2_reg), + name=prefix + '_emb_' + str(count) + '-' + feat.name) + + else: + sparse_embedding[feat.name] = Embedding(feat.dimension, embedding_size, + embeddings_initializer=RandomNormal( + mean=0.0, stddev=init_std, seed=seed), + embeddings_regularizer=l2( + l2_reg), + name=prefix+'_emb_' + str(count) + '-' + feat.name) + + count += 1 + + return sparse_embedding + + +def merge_dense_input(dense_input_, 
embed_list, embedding_size, l2_reg): + dense_input = list(dense_input_.values()) + if len(dense_input) > 0: + if embedding_size == "auto": + if len(dense_input) == 1: + continuous_embedding_list = dense_input[0] + else: + continuous_embedding_list = Concatenate()(dense_input) + continuous_embedding_list = Reshape( + [1, len(dense_input)])(continuous_embedding_list) + embed_list.append(continuous_embedding_list) + + else: + continuous_embedding_list = list( + map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg), ), + dense_input)) + continuous_embedding_list = list( + map(Reshape((1, embedding_size)), continuous_embedding_list)) + embed_list += continuous_embedding_list + + return embed_list + + +def merge_sequence_input(embedding_dict, embed_list, sequence_input_dict, sequence_len_dict, sequence_max_len_dict, sequence_pooling_dict): + if len(sequence_input_dict) > 0: + sequence_embed_dict = get_varlen_embedding_vec_dict( + embedding_dict, sequence_input_dict) + sequence_embed_list = get_pooling_vec_list( + sequence_embed_dict, sequence_len_dict, sequence_max_len_dict, sequence_pooling_dict) + embed_list += sequence_embed_list + + return embed_list + + +def get_embedding_vec_list(embedding_dict, input_dict): + + return [embedding_dict[feat](v) + for feat, v in input_dict.items()] + + +def get_varlen_embedding_vec_dict(embedding_dict, input_dict): + + return {feat: embedding_dict[feat](v) + for feat, v in input_dict.items()} + + +def get_pooling_vec_list(sequence_embed_dict, sequence_len_dict, sequence_max_len_dict, sequence_pooling_dict): + return [SequencePoolingLayer(sequence_max_len_dict[feat], sequence_pooling_dict[feat])( + [v, sequence_len_dict[feat]]) for feat, v in sequence_embed_dict.items()] + + +def get_inputs_list(inputs): + return list(chain(*list(map(lambda x: x.values(), inputs)))) + + +def get_inputs_embedding(feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed, include_linear=True): + sparse_input_dict, dense_input_dict = create_input_dict(feature_dim_dict) + sequence_input_dict, sequence_pooling_dict, sequence_input_len_dict, sequence_max_len_dict = create_sequence_input_dict( + feature_dim_dict) + + deep_sparse_emb_dict = create_embedding_dict( + feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding) + + deep_emb_list = get_embedding_vec_list( + deep_sparse_emb_dict, sparse_input_dict) + + deep_emb_list = merge_sequence_input(deep_sparse_emb_dict, deep_emb_list, sequence_input_dict, + sequence_input_len_dict, sequence_max_len_dict, sequence_pooling_dict) + + deep_emb_list = merge_dense_input( + dense_input_dict, deep_emb_list, embedding_size, l2_reg_embedding) + if include_linear: + linear_sparse_emb_dict = create_embedding_dict( + feature_dim_dict, 1, init_std, seed, l2_reg_linear, 'linear') + linear_emb_list = get_embedding_vec_list( + linear_sparse_emb_dict, sparse_input_dict) + linear_emb_list = merge_sequence_input(linear_sparse_emb_dict, linear_emb_list, sequence_input_dict, + sequence_input_len_dict, + sequence_max_len_dict, sequence_pooling_dict) + + linear_logit = get_linear_logit( + linear_emb_list, dense_input_dict, l2_reg_linear) + else: + linear_logit = None + + inputs_list = get_inputs_list( + [sparse_input_dict, dense_input_dict, sequence_input_dict, sequence_input_len_dict]) + return deep_emb_list, linear_logit, inputs_list diff --git a/deepctr/models/afm.py b/deepctr/models/afm.py index acb15efa..5beec8b4 100644 --- a/deepctr/models/afm.py +++ b/deepctr/models/afm.py @@ -9,13 +9,10 @@ 
(https://arxiv.org/abs/1708.04617) """ - -from tensorflow.python.keras.layers import Dense, Concatenate, Reshape, add -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l2 - -from ..utils import get_input, get_share_embeddings +import tensorflow as tf +from ..input_embedding import get_inputs_embedding from ..layers import PredictionLayer, AFMLayer, FM +from ..utils import concat_fun def AFM(feature_dim_dict, embedding_size=8, use_attention=True, attention_factor=8, @@ -48,41 +45,18 @@ def AFM(feature_dim_dict, embedding_size=8, use_attention=True, attention_factor raise ValueError("feature_dim_dict['dense'] must be a list,cur is", type( feature_dim_dict['dense'])) - sparse_input, dense_input = get_input(feature_dim_dict, None) - sparse_embedding, linear_embedding, = get_share_embeddings( - feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding, l2_reg_linear) - - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - linear_term = [linear_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - if len(linear_term) > 1: - linear_term = add(linear_term) - elif len(linear_term) == 1: - linear_term = linear_term[0] + deep_emb_list, linear_logit, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed) - if len(dense_input) > 0: - continuous_embedding_list = list( - map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding), ), - dense_input)) - continuous_embedding_list = list( - map(Reshape((1, embedding_size)), continuous_embedding_list)) - embed_list += continuous_embedding_list - - dense_input_ = dense_input[0] if len( - dense_input) == 1 else Concatenate()(dense_input) - linear_dense_logit = Dense( - 1, activation=None, use_bias=False, kernel_regularizer=l2(l2_reg_linear))(dense_input_) - linear_term = add([linear_dense_logit, linear_term]) - - fm_input = Concatenate(axis=1)(embed_list) + fm_input = concat_fun(deep_emb_list,axis=1) if use_attention: - fm_out = AFMLayer(attention_factor, l2_reg_att, - keep_prob, seed)(embed_list) + fm_logit = AFMLayer(attention_factor, l2_reg_att, + keep_prob, seed)(deep_emb_list) else: - fm_out = FM()(fm_input) + fm_logit = FM()(fm_input) - final_logit = add([linear_term, fm_out]) + final_logit = tf.keras.layers.add([linear_logit, fm_logit]) output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, outputs=output) + + model = tf.keras.models.Model(inputs=inputs_list, outputs=output) return model diff --git a/deepctr/models/autoint.py b/deepctr/models/autoint.py index c98b4fa9..08b1fc26 100644 --- a/deepctr/models/autoint.py +++ b/deepctr/models/autoint.py @@ -9,14 +9,10 @@ """ -from tensorflow.python.keras.layers import Dense, Embedding, Concatenate -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.initializers import RandomNormal -from tensorflow.python.keras.regularizers import l2 import tensorflow as tf - -from ..utils import get_input +from ..input_embedding import get_inputs_embedding from ..layers import PredictionLayer, MLP, InteractingLayer +from ..utils import concat_fun def AutoInt(feature_dim_dict, embedding_size=8, att_layer_num=3, att_embedding_size=8, att_head_num=2, att_res=True, hidden_size=(256, 256), activation='relu', @@ -48,56 +44,37 @@ def AutoInt(feature_dim_dict, embedding_size=8, att_layer_num=3, att_embedding_s raise ValueError( "feature_dim must be a dict like 
{'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") - sparse_input, dense_input = get_input(feature_dim_dict, None,) - sparse_embedding = get_embeddings( - feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding) - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] + deep_emb_list, _, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, 0, init_std, seed, False) - att_input = Concatenate(axis=1)(embed_list) if len( - embed_list) > 1 else embed_list[0] + att_input = concat_fun(deep_emb_list, axis=1) - for i in range(att_layer_num): + for _ in range(att_layer_num): att_input = InteractingLayer( att_embedding_size, att_head_num, att_res)(att_input) att_output = tf.keras.layers.Flatten()(att_input) - deep_input = tf.keras.layers.Flatten()(Concatenate()(embed_list) - if len(embed_list) > 1 else embed_list[0]) - if len(dense_input) > 0: - if len(dense_input) == 1: - continuous_list = dense_input[0] - else: - continuous_list = Concatenate()(dense_input) - - deep_input = Concatenate()([deep_input, continuous_list]) + deep_input = tf.keras.layers.Flatten()(concat_fun(deep_emb_list)) if len(hidden_size) > 0 and att_layer_num > 0: # Deep & Interacting Layer deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input) - stack_out = Concatenate()([att_output, deep_out]) - final_logit = Dense(1, use_bias=False, activation=None)(stack_out) + stack_out = tf.keras.layers.Concatenate()([att_output, deep_out]) + final_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(stack_out) elif len(hidden_size) > 0: # Only Deep deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input) - final_logit = Dense(1, use_bias=False, activation=None)(deep_out) + final_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(deep_out) elif att_layer_num > 0: # Only Interacting Layer - final_logit = Dense(1, use_bias=False, activation=None)(att_output) + final_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(att_output) else: # Error raise NotImplementedError output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, outputs=output) - - return model - -def get_embeddings(feature_dim_dict, embedding_size, init_std, seed, l2_rev_V): - sparse_embedding = [Embedding(feature_dim_dict["sparse"][feat], embedding_size, - embeddings_initializer=RandomNormal( - mean=0.0, stddev=init_std, seed=seed), - embeddings_regularizer=l2(l2_rev_V), - name='sparse_emb_' + str(i) + '-' + feat) for i, feat in - enumerate(feature_dim_dict["sparse"])] + model = tf.keras.models.Model(inputs=inputs_list, outputs=output) - return sparse_embedding + return model diff --git a/deepctr/models/dcn.py b/deepctr/models/dcn.py index b01c12b6..7fe0ea72 100644 --- a/deepctr/models/dcn.py +++ b/deepctr/models/dcn.py @@ -6,13 +6,11 @@ Reference: [1] Wang R, Fu B, Fu G, et al. Deep & cross network for ad click predictions[C]//Proceedings of the ADKDD'17. ACM, 2017: 12. 
(https://arxiv.org/abs/1708.05123) """ -from tensorflow.python.keras.layers import Dense, Embedding, Concatenate, Flatten -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.initializers import RandomNormal -from tensorflow.python.keras.regularizers import l2 +import tensorflow as tf -from ..utils import get_input +from ..input_embedding import * from ..layers import CrossNet, PredictionLayer, MLP +from ..utils import concat_fun def DCN(feature_dim_dict, embedding_size='auto', @@ -43,27 +41,16 @@ def DCN(feature_dim_dict, embedding_size='auto', raise ValueError( "feature_dim must be a dict like {'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") - sparse_input, dense_input = get_input(feature_dim_dict, None,) - sparse_embedding = get_embeddings( - feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding) - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] + deep_emb_list, _, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, 0, init_std, seed, False) - deep_input = Flatten()(Concatenate()(embed_list) - if len(embed_list) > 1 else embed_list[0]) - if len(dense_input) > 0: - if len(dense_input) == 1: - continuous_list = dense_input[0] - else: - continuous_list = Concatenate()(dense_input) - - deep_input = Concatenate()([deep_input, continuous_list]) + deep_input = tf.keras.layers.Flatten()(concat_fun(deep_emb_list)) if len(hidden_size) > 0 and cross_num > 0: # Deep & Cross deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input) cross_out = CrossNet(cross_num, l2_reg=l2_reg_cross)(deep_input) - stack_out = Concatenate()([cross_out, deep_out]) + stack_out = tf.keras.layers.Concatenate()([cross_out, deep_out]) final_logit = Dense(1, use_bias=False, activation=None)(stack_out) elif len(hidden_size) > 0: # Only Deep deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, @@ -75,29 +62,8 @@ def DCN(feature_dim_dict, embedding_size='auto', else: # Error raise NotImplementedError - # Activation(self.final_activation)(final_logit) output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, outputs=output) - - return model + model = tf.keras.models.Model(inputs=inputs_list, outputs=output) -def get_embeddings(feature_dim_dict, embedding_size, init_std, seed, l2_rev_V): - if embedding_size == "auto": - sparse_embedding = [Embedding(feature_dim_dict["sparse"][feat], 6*int(pow(feature_dim_dict["sparse"][feat], 0.25)), - embeddings_initializer=RandomNormal( - mean=0.0, stddev=init_std, seed=seed), - embeddings_regularizer=l2(l2_rev_V), name='sparse_emb_' + str(i) + '-'+feat) for i, feat in - enumerate(feature_dim_dict["sparse"])] - - print("Using auto embedding size,the connected vector dimension is", sum( - [6*int(pow(feature_dim_dict["sparse"][k], 0.25)) for k, v in feature_dim_dict["sparse"].items()])) - else: - sparse_embedding = [Embedding(feature_dim_dict["sparse"][feat], embedding_size, - embeddings_initializer=RandomNormal( - mean=0.0, stddev=init_std, seed=seed), - embeddings_regularizer=l2(l2_rev_V), - name='sparse_emb_' + str(i) + '-' + feat) for i, feat in - enumerate(feature_dim_dict["sparse"])] - - return sparse_embedding + return model diff --git a/deepctr/models/deepfm.py b/deepctr/models/deepfm.py index a91bf483..21528bc9 100644 --- a/deepctr/models/deepfm.py +++ b/deepctr/models/deepfm.py @@ -8,11 +8,10 @@ """ -from tensorflow.python.keras.layers import Dense, 
Concatenate, Reshape, Flatten, add -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l2 -from ..utils import get_input, get_share_embeddings +import tensorflow as tf +from ..input_embedding import get_inputs_embedding from ..layers import PredictionLayer, MLP, FM +from ..utils import concat_fun def DeepFM(feature_dim_dict, embedding_size=8, @@ -46,51 +45,28 @@ def DeepFM(feature_dim_dict, embedding_size=8, raise ValueError("feature_dim_dict['dense'] must be a list,cur is", type( feature_dim_dict['dense'])) - sparse_input, dense_input = get_input(feature_dim_dict, None) - sparse_embedding, linear_embedding, = get_share_embeddings( - feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding, l2_reg_linear) + deep_emb_list, linear_logit, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed) - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - linear_term = [linear_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - if len(linear_term) > 1: - linear_term = add(linear_term) - elif len(linear_term) == 1: - linear_term = linear_term[0] - - if len(dense_input) > 0: - continuous_embedding_list = list( - map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding), ), - dense_input)) - continuous_embedding_list = list( - map(Reshape((1, embedding_size)), continuous_embedding_list)) - embed_list += continuous_embedding_list - - dense_input_ = dense_input[0] if len( - dense_input) == 1 else Concatenate()(dense_input) - linear_dense_logit = Dense( - 1, activation=None, use_bias=False, kernel_regularizer=l2(l2_reg_linear))(dense_input_) - linear_term = add([linear_dense_logit, linear_term]) - - fm_input = Concatenate(axis=1)(embed_list) - deep_input = Flatten()(fm_input) + fm_input = concat_fun(deep_emb_list,axis=1) + deep_input = tf.keras.layers.Flatten()(fm_input) fm_out = FM()(fm_input) deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input) - deep_logit = Dense(1, use_bias=False, activation=None)(deep_out) + deep_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(deep_out) if len(hidden_size) == 0 and use_fm == False: # only linear - final_logit = linear_term + final_logit = linear_logit elif len(hidden_size) == 0 and use_fm == True: # linear + FM - final_logit = add([linear_term, fm_out]) + final_logit = tf.keras.layers.add([linear_logit, fm_out]) elif len(hidden_size) > 0 and use_fm == False: # linear + Deep - final_logit = add([linear_term, deep_logit]) + final_logit = tf.keras.layers.add([linear_logit, deep_logit]) elif len(hidden_size) > 0 and use_fm == True: # linear + FM + Deep - final_logit = add([linear_term, fm_out, deep_logit]) + final_logit = tf.keras.layers.add([linear_logit, fm_out, deep_logit]) else: raise NotImplementedError output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, outputs=output) + model = tf.keras.models.Model(inputs=inputs_list, outputs=output) return model diff --git a/deepctr/models/din.py b/deepctr/models/din.py index 223169a0..bb4017b4 100644 --- a/deepctr/models/din.py +++ b/deepctr/models/din.py @@ -15,6 +15,7 @@ from ..layers import MLP from ..sequence import SequencePoolingLayer, AttentionSequencePoolingLayer from ..activations import Dice +from ..utils import concat_fun def get_input(feature_dim_dict, seq_feature_list, seq_max_len): @@ -77,12 +78,9 @@ def 
DIN(feature_dim_dict, seq_feature_list, embedding_size=8, hist_len_max=16, deep_input_emb_list = [sparse_embedding_dict[feat]( sparse_input[feat]) for feat in feature_dim_dict["sparse"]] - query_emb = Concatenate()(query_emb_list) if len( - query_emb_list) > 1 else query_emb_list[0] - keys_emb = Concatenate()(keys_emb_list) if len( - keys_emb_list) > 1 else keys_emb_list[0] - deep_input_emb = Concatenate()(deep_input_emb_list) if len( - deep_input_emb_list) > 1 else deep_input_emb_list[0] + query_emb = concat_fun(query_emb_list) + keys_emb = concat_fun(keys_emb_list) + deep_input_emb = concat_fun(deep_input_emb_list) if use_din: hist = AttentionSequencePoolingLayer(att_hidden_size, att_activation, weight_normalization=att_weight_normalization)([ diff --git a/deepctr/models/fnn.py b/deepctr/models/fnn.py index 92295dbb..0ea5bd55 100644 --- a/deepctr/models/fnn.py +++ b/deepctr/models/fnn.py @@ -6,13 +6,11 @@ Reference: [1] Zhang W, Du T, Wang J. Deep learning over multi-field categorical data[C]//European conference on information retrieval. Springer, Cham, 2016: 45-57.(https://arxiv.org/pdf/1601.02376.pdf) """ - -from tensorflow.python.keras.layers import Dense, Concatenate, Reshape, add -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l2 +import tensorflow as tf from ..layers import PredictionLayer, MLP -from ..utils import get_input, get_share_embeddings +from ..input_embedding import get_inputs_embedding +from ..utils import concat_fun def FNN(feature_dim_dict, embedding_size=8, @@ -40,42 +38,17 @@ def FNN(feature_dim_dict, embedding_size=8, raise ValueError( "feature_dim must be a dict like {'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") - sparse_input, dense_input = get_input(feature_dim_dict, None) - sparse_embedding, linear_embedding, = get_share_embeddings(feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding, - l2_reg_linear) - - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(feature_dim_dict["sparse"]))] - - linear_term = [linear_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - if len(linear_term) > 1: - linear_term = add(linear_term) - elif len(linear_term) == 1: - linear_term = linear_term[0] + deep_emb_list, linear_logit, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed) - if len(dense_input) > 0: - continuous_embedding_list = list( - map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding), ), - dense_input)) - continuous_embedding_list = list( - map(Reshape((1, embedding_size)), continuous_embedding_list)) - embed_list += continuous_embedding_list - - dense_input_ = dense_input[0] if len( - dense_input) == 1 else Concatenate()(dense_input) - linear_dense_logit = Dense( - 1, activation=None, use_bias=False, kernel_regularizer=l2(l2_reg_linear))(dense_input_) - linear_term = add([linear_dense_logit, linear_term]) - - num_inputs = len(dense_input) + len(sparse_input) - deep_input = Reshape([num_inputs*embedding_size] - )(Concatenate()(embed_list)) + deep_input = tf.keras.layers.Flatten()(concat_fun(deep_emb_list)) deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, False, seed)(deep_input) - deep_logit = Dense(1, use_bias=False, activation=None)(deep_out) - final_logit = add([deep_logit, linear_term]) + deep_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(deep_out) + final_logit = tf.keras.layers.add([deep_logit, 
linear_logit]) output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, - outputs=output) + + model = tf.keras.models.Model(inputs=inputs_list, + outputs=output) return model diff --git a/deepctr/models/nfm.py b/deepctr/models/nfm.py index 23a85d09..a041b61d 100644 --- a/deepctr/models/nfm.py +++ b/deepctr/models/nfm.py @@ -6,12 +6,10 @@ Reference: [1] He X, Chua T S. Neural factorization machines for sparse predictive analytics[C]//Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2017: 355-364. (https://arxiv.org/abs/1708.05027) """ - -from tensorflow.python.keras.layers import Dense, Concatenate, Reshape, Dropout, add -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l2 +import tensorflow as tf from ..layers import PredictionLayer, MLP, BiInteractionPooling -from ..utils import get_input, get_share_embeddings +from ..input_embedding import get_inputs_embedding +from ..utils import concat_fun def NFM(feature_dim_dict, embedding_size=8, @@ -38,48 +36,23 @@ def NFM(feature_dim_dict, embedding_size=8, raise ValueError( "feature_dim must be a dict like {'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") - sparse_input, dense_input = get_input(feature_dim_dict, None) - sparse_embedding, linear_embedding = get_share_embeddings( - feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding, l2_reg_linear) - - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - - linear_term = [linear_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - if len(linear_term) > 1: - linear_term = add(linear_term) - elif len(linear_term) == 1: - linear_term = linear_term[0] - - if len(dense_input) > 0: - continuous_embedding_list = list( - map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding), ), - dense_input)) - continuous_embedding_list = list( - map(Reshape((1, embedding_size)), continuous_embedding_list)) - embed_list += continuous_embedding_list - - dense_input_ = dense_input[0] if len( - dense_input) == 1 else Concatenate()(dense_input) - linear_dense_logit = Dense( - 1, activation=None, use_bias=False, kernel_regularizer=l2(l2_reg_linear))(dense_input_) - linear_term = add([linear_dense_logit, linear_term]) - - fm_input = Concatenate(axis=1)(embed_list) + deep_emb_list, linear_logit, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed) + fm_input = concat_fun(deep_emb_list,axis=1) bi_out = BiInteractionPooling()(fm_input) - bi_out = Dropout(1 - keep_prob)(bi_out) + bi_out = tf.keras.layers.Dropout(1 - keep_prob)(bi_out) deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, False, seed)(bi_out) - deep_logit = Dense(1, use_bias=False, activation=None)(deep_out) + deep_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(deep_out) - final_logit = linear_term # TODO add bias term + final_logit = linear_logit if len(hidden_size) > 0: - final_logit = add([final_logit, deep_logit]) + final_logit = tf.keras.layers.add([final_logit, deep_logit]) output = PredictionLayer(final_activation)(final_logit) - print(output) - model = Model(inputs=sparse_input + dense_input, outputs=output) + + model = tf.keras.models.Model(inputs=inputs_list, outputs=output) return model diff --git a/deepctr/models/pnn.py b/deepctr/models/pnn.py index fd02c611..75526c99 100644 
--- a/deepctr/models/pnn.py +++ b/deepctr/models/pnn.py @@ -7,14 +7,10 @@ [1] Qu Y, Cai H, Ren K, et al. Product-based neural networks for user response prediction[C]//Data Mining (ICDM), 2016 IEEE 16th International Conference on. IEEE, 2016: 1149-1154.(https://arxiv.org/pdf/1611.00144.pdf) """ -from tensorflow.python.keras.layers import Dense, Embedding, Concatenate, Reshape, Flatten -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.initializers import RandomNormal -from tensorflow.python.keras.regularizers import l2 - - +import tensorflow as tf from ..layers import PredictionLayer, MLP, InnerProductLayer, OutterProductLayer -from ..utils import get_input +from ..input_embedding import get_inputs_embedding +from ..utils import concat_fun def PNN(feature_dim_dict, embedding_size=8, hidden_size=(128, 128), l2_reg_embedding=1e-5, l2_reg_deep=0, @@ -43,48 +39,35 @@ def PNN(feature_dim_dict, embedding_size=8, hidden_size=(128, 128), l2_reg_embed "feature_dim must be a dict like {'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") if kernel_type not in ['mat', 'vec', 'num']: raise ValueError("kernel_type must be mat,vec or num") - sparse_input, dense_input = get_input(feature_dim_dict, None) - sparse_embedding = [Embedding(feature_dim_dict["sparse"][feat], embedding_size, - embeddings_initializer=RandomNormal( - mean=0.0, stddev=init_std, seed=seed), - embeddings_regularizer=l2( - l2_reg_embedding), - name='sparse_emb_' + str(i) + '-' + feat) for i, feat in - enumerate(feature_dim_dict["sparse"])] - - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(feature_dim_dict["sparse"]))] + deep_emb_list, _, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, 0, init_std, seed, False) - if len(dense_input) > 0: - continuous_embedding_list = list( - map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding), ), - dense_input)) - continuous_embedding_list = list( - map(Reshape((1, embedding_size)), continuous_embedding_list)) - embed_list += continuous_embedding_list - - inner_product = Flatten()(InnerProductLayer()(embed_list)) - outter_product = OutterProductLayer(kernel_type)(embed_list) + inner_product = tf.keras.layers.Flatten()(InnerProductLayer()(deep_emb_list)) + outter_product = OutterProductLayer(kernel_type)(deep_emb_list) # ipnn deep input - linear_signal = Reshape( - [len(embed_list)*embedding_size])(Concatenate()(embed_list)) + linear_signal = tf.keras.layers.Reshape( + [len(deep_emb_list)*embedding_size])(concat_fun(deep_emb_list)) if use_inner and use_outter: - deep_input = Concatenate()( + deep_input = tf.keras.layers.Concatenate()( [linear_signal, inner_product, outter_product]) elif use_inner: - deep_input = Concatenate()([linear_signal, inner_product]) + deep_input = tf.keras.layers.Concatenate()( + [linear_signal, inner_product]) elif use_outter: - deep_input = Concatenate()([linear_signal, outter_product]) + deep_input = tf.keras.layers.Concatenate()( + [linear_signal, outter_product]) else: deep_input = linear_signal deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, False, seed)(deep_input) - deep_logit = Dense(1, use_bias=False, activation=None)(deep_out) - final_logit = deep_logit - output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, - outputs=output) + deep_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(deep_out) + + output = 
PredictionLayer(final_activation)(deep_logit) + + model = tf.keras.models.Model(inputs=inputs_list, + outputs=output) return model diff --git a/deepctr/models/wdl.py b/deepctr/models/wdl.py index c7d6bfbb..a8bf674d 100644 --- a/deepctr/models/wdl.py +++ b/deepctr/models/wdl.py @@ -10,7 +10,7 @@ from tensorflow.python.keras.layers import Dense, Concatenate, Flatten, add from tensorflow.python.keras.models import Model from ..layers import PredictionLayer, MLP -from ..utils import get_input, get_sep_embeddings +from ..input_embedding import create_input_dict, create_embedding_dict, get_embedding_vec_list, get_inputs_list def WDL(deep_feature_dim_dict, wide_feature_dim_dict, embedding_size=8, hidden_size=(128, 128), l2_reg_linear=1e-5, l2_reg_embedding=1e-5, l2_reg_deep=0, init_std=0.0001, seed=1024, keep_prob=1, activation='relu', final_activation='sigmoid',): @@ -35,18 +35,22 @@ def WDL(deep_feature_dim_dict, wide_feature_dim_dict, embedding_size=8, hidden_s raise ValueError( "feature_dim must be a dict like {'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") - sparse_input, dense_input, bias_sparse_input, bias_dense_input = get_input( - deep_feature_dim_dict, wide_feature_dim_dict) - sparse_embedding, wide_linear_embedding = get_sep_embeddings( - deep_feature_dim_dict, wide_feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding, l2_reg_linear) + sparse_input, dense_input, = create_input_dict( + deep_feature_dim_dict) + bias_sparse_input, bias_dense_input = create_input_dict( + wide_feature_dim_dict, 'bias') + sparse_embedding = create_embedding_dict( + deep_feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding) + wide_linear_embedding = create_embedding_dict( + wide_feature_dim_dict, 1, init_std, seed, l2_reg_linear, 'linear') + + embed_list = get_embedding_vec_list(sparse_embedding, sparse_input) - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] deep_input = Concatenate()(embed_list) if len( embed_list) > 1 else embed_list[0] deep_input = Flatten()(deep_input) if len(dense_input) > 0: - deep_input = Concatenate()([deep_input]+dense_input) + deep_input = Concatenate()([deep_input]+list(dense_input.values())) deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, False, seed)(deep_input) @@ -54,17 +58,19 @@ def WDL(deep_feature_dim_dict, wide_feature_dim_dict, embedding_size=8, hidden_s final_logit = deep_logit if len(wide_feature_dim_dict['dense']) + len(wide_feature_dim_dict['sparse']) > 0: if len(wide_feature_dim_dict['sparse']) > 0: - bias_embed_list = [wide_linear_embedding[i]( - bias_sparse_input[i]) for i in range(len(bias_sparse_input))] + bias_embed_list = get_embedding_vec_list( + wide_linear_embedding, bias_sparse_input) linear_term = add(bias_embed_list) if len( bias_embed_list) > 1 else bias_embed_list[0] final_logit = add([final_logit, linear_term]) if len(wide_feature_dim_dict['dense']) > 0: wide_dense_term = Dense(1, use_bias=False, activation=None)(Concatenate()( - bias_dense_input) if len(bias_dense_input) > 1 else bias_dense_input[0]) + list(bias_dense_input.values())) if len(bias_dense_input) > 1 else list(bias_dense_input.values())[0]) final_logit = add([final_logit, wide_dense_term]) output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input + - bias_sparse_input + bias_dense_input, outputs=output) + + inputs_list = get_inputs_list( + [sparse_input, dense_input, bias_sparse_input, bias_dense_input]) + model = 
Model(inputs=inputs_list, outputs=output) return model diff --git a/deepctr/models/xdeepfm.py b/deepctr/models/xdeepfm.py index 03798e98..09e6a513 100644 --- a/deepctr/models/xdeepfm.py +++ b/deepctr/models/xdeepfm.py @@ -6,11 +6,10 @@ Reference: [1] Lian J, Zhou X, Zhang F, et al. xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems[J]. arXiv preprint arXiv:1803.05170, 2018.(https://arxiv.org/pdf/1803.05170.pdf) """ -from tensorflow.python.keras.layers import Dense, Concatenate, Flatten, add, Reshape -from tensorflow.python.keras.models import Model -from tensorflow.python.keras.regularizers import l2 -from deepctr.utils import get_input, get_share_embeddings -from deepctr.layers import PredictionLayer, MLP, CIN +import tensorflow as tf +from ..input_embedding import get_inputs_embedding +from ..layers import PredictionLayer, MLP, CIN +from ..utils import concat_fun def xDeepFM(feature_dim_dict, embedding_size=8, hidden_size=(256, 256), cin_layer_size=(128, 128,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_deep=0, init_std=0.0001, seed=1024, keep_prob=1, activation='relu', final_activation='sigmoid', use_bn=False): @@ -36,59 +35,36 @@ def xDeepFM(feature_dim_dict, embedding_size=8, hidden_size=(256, 256), cin_laye if not isinstance(feature_dim_dict, dict) or "sparse" not in feature_dim_dict or "dense" not in feature_dim_dict: raise ValueError( "feature_dim must be a dict like {'sparse':{'field_1':4,'field_2':3,'field_3':2},'dense':['field_5',]}") - sparse_input, dense_input = get_input(feature_dim_dict, None) - sparse_embedding, linear_embedding, = get_share_embeddings(feature_dim_dict, embedding_size, init_std, seed, l2_reg_embedding, - l2_reg_linear) - embed_list = [sparse_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - linear_term = [linear_embedding[i](sparse_input[i]) - for i in range(len(sparse_input))] - if len(linear_term) > 1: - linear_term = add(linear_term) - elif len(linear_term) == 1: - linear_term = linear_term[0] + deep_emb_list, linear_logit, inputs_list = get_inputs_embedding( + feature_dim_dict, embedding_size, l2_reg_embedding, l2_reg_linear, init_std, seed) - if len(dense_input) > 0: - continuous_embedding_list = list( - map(Dense(embedding_size, use_bias=False, kernel_regularizer=l2(l2_reg_embedding), ), - dense_input)) - continuous_embedding_list = list( - map(Reshape((1, embedding_size)), continuous_embedding_list)) - embed_list += continuous_embedding_list - - dense_input_ = dense_input[0] if len( - dense_input) == 1 else Concatenate()(dense_input) - linear_dense_logit = Dense( - 1, activation=None, use_bias=False, kernel_regularizer=l2(l2_reg_linear))(dense_input_) - linear_term = add([linear_dense_logit, linear_term]) - - linear_logit = linear_term - - fm_input = Concatenate(axis=1)(embed_list) if len( - embed_list) > 1 else embed_list[0] + fm_input = concat_fun(deep_emb_list,axis=1) if len(cin_layer_size) > 0: exFM_out = CIN(cin_layer_size, cin_activation, cin_split_half, seed)(fm_input) - exFM_logit = Dense(1, activation=None,)(exFM_out) + exFM_logit = tf.keras.layers.Dense(1, activation=None,)(exFM_out) - deep_input = Flatten()(fm_input) + deep_input = tf.keras.layers.Flatten()(fm_input) deep_out = MLP(hidden_size, activation, l2_reg_deep, keep_prob, use_bn, seed)(deep_input) - deep_logit = Dense(1, use_bias=False, activation=None)(deep_out) + deep_logit = tf.keras.layers.Dense( + 1, use_bias=False, activation=None)(deep_out) if len(hidden_size) == 0 
and len(cin_layer_size) == 0: # only linear final_logit = linear_logit elif len(hidden_size) == 0 and len(cin_layer_size) > 0: # linear + CIN - final_logit = add([linear_logit, exFM_logit]) + final_logit = tf.keras.layers.add([linear_logit, exFM_logit]) elif len(hidden_size) > 0 and len(cin_layer_size) == 0: # linear + Deep - final_logit = add([linear_logit, deep_logit]) + final_logit = tf.keras.layers.add([linear_logit, deep_logit]) elif len(hidden_size) > 0 and len(cin_layer_size) > 0: # linear + CIN + Deep - final_logit = add([linear_logit, deep_logit, exFM_logit]) + final_logit = tf.keras.layers.add( + [linear_logit, deep_logit, exFM_logit]) else: raise NotImplementedError output = PredictionLayer(final_activation)(final_logit) - model = Model(inputs=sparse_input + dense_input, outputs=output) + + model = tf.keras.models.Model(inputs=inputs_list, outputs=output) return model diff --git a/deepctr/utils.py b/deepctr/utils.py index 329b1784..3c6d860f 100644 --- a/deepctr/utils.py +++ b/deepctr/utils.py @@ -1,10 +1,10 @@ +import collections import json import logging from threading import Thread import requests -from tensorflow.python.keras.initializers import RandomNormal -from tensorflow.python.keras.layers import Embedding, Input +from tensorflow.python.keras.layers import Dense, Concatenate, add from .activations import * from .layers import * @@ -31,52 +31,36 @@ 'InteractingLayer': InteractingLayer} -def get_input(feature_dim_dict, bias_feature_dim_dict=None): - sparse_input = [Input(shape=(1,), name='sparse_' + str(i) + '-' + feat) for i, feat in - enumerate(feature_dim_dict["sparse"])] - dense_input = [Input(shape=(1,), name='dense_' + str(i) + '-' + feat) for i, feat in - enumerate(feature_dim_dict["dense"])] - if bias_feature_dim_dict is None: - return sparse_input, dense_input - else: - bias_sparse_input = [Input(shape=(1,), name='bias_sparse_' + str(i) + '-' + feat) for i, feat in - enumerate(bias_feature_dim_dict["sparse"])] - bias_dense_input = [Input(shape=(1,), name='bias_dense_' + str(i) + '-' + feat) for i, feat in - enumerate(bias_feature_dim_dict["dense"])] - return sparse_input, dense_input, bias_sparse_input, bias_dense_input - - -def get_share_embeddings(feature_dim_dict, embedding_size, init_std, seed, l2_rev_V, l2_reg_w): - sparse_embedding = [Embedding(feature_dim_dict["sparse"][feat], embedding_size, - embeddings_initializer=RandomNormal( - mean=0.0, stddev=init_std, seed=seed), - embeddings_regularizer=l2(l2_rev_V), - name='sparse_emb_' + str(i) + '-' + feat) for i, feat in - enumerate(feature_dim_dict["sparse"])] - linear_embedding = [Embedding(feature_dim_dict["sparse"][feat], 1, - embeddings_initializer=RandomNormal(mean=0.0, stddev=init_std, - seed=seed), embeddings_regularizer=l2(l2_reg_w), - name='linear_emb_' + str(i) + '-' + feat) for - i, feat in enumerate(feature_dim_dict["sparse"])] +VarLenFeature = collections.namedtuple( + 'VarLenFeatureConfig', ['name', 'dimension', 'maxlen', 'combiner']) - return sparse_embedding, linear_embedding +def get_linear_logit(linear_term, dense_input_, l2_reg): + if len(linear_term) > 1: + linear_term = add(linear_term) + elif len(linear_term) == 1: + linear_term = linear_term[0] + else: + linear_term = None -def get_sep_embeddings(deep_feature_dim_dict, wide_feature_dim_dict, embedding_size, init_std, seed, l2_rev_V, l2_reg_w): - sparse_embedding = [Embedding(deep_feature_dim_dict["sparse"][feat], embedding_size, - embeddings_initializer=RandomNormal( - mean=0.0, stddev=init_std, seed=seed), - 
embeddings_regularizer=l2(l2_rev_V), - name='sparse_emb_' + str(i) + '-' + feat) for i, feat in - enumerate(deep_feature_dim_dict["sparse"])] - linear_embedding = [Embedding(wide_feature_dim_dict["sparse"][feat], 1, - embeddings_initializer=RandomNormal(mean=0.0, stddev=init_std, - seed=seed), embeddings_regularizer=l2(l2_reg_w), - name='linear_emb_' + str(i) + '-' + feat) for - i, feat in enumerate(wide_feature_dim_dict["sparse"])] + dense_input = list(dense_input_.values()) + if len(dense_input) > 0: + dense_input__ = dense_input[0] if len( + dense_input) == 1 else Concatenate()(dense_input) + linear_dense_logit = Dense( + 1, activation=None, use_bias=False, kernel_regularizer=l2(l2_reg))(dense_input__) + if linear_term is not None: + linear_term = add([linear_dense_logit, linear_term]) + else: + linear_term = linear_dense_logit - return sparse_embedding, linear_embedding + return linear_term +def concat_fun(inputs,axis=-1): + if len(inputs) == 1: + return inputs[0] + else: + return Concatenate(axis=axis)(inputs) def check_version(version): """Return version of package on pypi.python.org using json.""" diff --git a/docs/pics/movielens_sample_with_genres.png b/docs/pics/movielens_sample_with_genres.png new file mode 100644 index 00000000..0fc65b05 Binary files /dev/null and b/docs/pics/movielens_sample_with_genres.png differ diff --git a/docs/source/Demo.rst b/docs/source/Demo.rst deleted file mode 100644 index b7aa0eaf..00000000 --- a/docs/source/Demo.rst +++ /dev/null @@ -1,103 +0,0 @@ -Demos -=========== - -Classification: Criteo ------------------------ - -The Criteo Display Ads dataset is for the purpose of predicting ads -click-through rate. It has 13 integer features and -26 categorical features where each category has a high cardinality. - -.. image:: ../pics/criteo_sample.png - :align: center - :scale: 70 % - -In this demo,we simply normailize the integer feature between 0 and 1,you -can try other transformation technique like log normalization or discretization. - -This example shows how to use *DeepFM* to solve a simple binary classification task. You can get the demo data -`criteo_sample.txt `_ and run the following codes. - -.. 
code-block:: python - - import pandas as pd - from sklearn.preprocessing import LabelEncoder,MinMaxScaler - from deepctr.models import DeepFM - - - data = pd.read_csv('./criteo_sample.txt') - - sparse_features = ['C' + str(i) for i in range(1, 27)] - dense_features = ['I'+str(i) for i in range(1,14)] - - data[sparse_features] = data[sparse_features].fillna('-1', ) - data[dense_features] = data[dense_features].fillna(0,) - - target = ['label'] - - # 1.Label Encoding for sparse features,and do simple Transformation for dense features - for feat in sparse_features: - lbe = LabelEncoder() - data[feat] = lbe.fit_transform(data[feat]) - mms = MinMaxScaler(feature_range=(0,1)) - data[dense_features] = mms.fit_transform(data[dense_features]) - - # 2.count #unique features for each sparse field,and record dense feature field name - - sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features} - dense_feature_list = dense_features - - # 3.generate input data for model - - model_input = [data[feat].values for feat in sparse_feature_dict] + [data[feat].values for feat in dense_feature_list] - - #4.Define Model,compile and - - - model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list}, final_activation='sigmoid') - model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], ) - history = model.fit(model_input, data[target].values, - batch_size=256, epochs=1, verbose=2, validation_split=0.2,) - - - -Regression: Movielens ----------------------- -The MovieLens data has been used for personalized tag recommendation,which -contains 668, 953 tag applications of users on movies. -Here is a small fraction of data include only sparse field. - -.. image:: ../pics/movielens_sample.png - :align: center - :scale: 70 % - -This example shows how to use *DeepFM* to solve a simple binary regression task. You can get the demo data -`movielens_sample.txt `_ and run the following codes. - -.. code-block:: python - - import pandas as pd - from sklearn.preprocessing import LabelEncoder,MinMaxScaler - from deepctr.models import DeepFM - - - data = pd.read_csv("./movielens_sample.txt") - sparse_features = [ "movie_id","user_id","gender","age","occupation","zip"] - target = ['rating'] - - # 1.Label Encoding for sparse features,and do simple Transformation for dense features - for feat in sparse_features: - lbe = LabelEncoder() - data[feat] = lbe.fit_transform(data[feat]) - #2.count #unique features for each sparse field - sparse_feature_dim = {feat:data[feat].nunique() for feat in sparse_features} - #3.generate input data for model - model_input = [data[feat].values for feat in sparse_feature_dim] - #4.Define Model,compile and train - model = DeepFM({"sparse":sparse_feature_dim,"dense":[]},final_activation='linear') - - model.compile("adam","mse",metrics=['mse'],) - history = model.fit(model_input,data[target].values, - batch_size=256,epochs=10,verbose=2,validation_split=0.2,) \ No newline at end of file diff --git a/docs/source/Examples.md b/docs/source/Examples.md new file mode 100644 index 00000000..a7d11996 --- /dev/null +++ b/docs/source/Examples.md @@ -0,0 +1,174 @@ +# Examples + + +## Classification: Criteo + +The Criteo Display Ads dataset is for the purpose of predicting ads +click-through rate. It has 13 integer features and +26 categorical features where each category has a high cardinality. 
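+
+The snippet later in this section min-max scales the integer features to [0, 1]; log normalization and discretization are common alternatives. Here is a minimal, illustrative sketch of both (an assumption-laden aside, not library API: it assumes only pandas and numpy, plus the `I1`-`I13` integer columns shown in the sample below):
+
+```python
+import numpy as np
+import pandas as pd
+
+data = pd.read_csv('./criteo_sample.txt')
+dense_features = ['I' + str(i) for i in range(1, 14)]
+data[dense_features] = data[dense_features].fillna(0)
+
+# Alternative 1: log normalization. log1p keeps zeros finite and
+# compresses the heavy tails typical of count-like features.
+log_normalized = np.log1p(data[dense_features].clip(lower=0))
+
+# Alternative 2: discretization into quantile buckets; the resulting
+# bucket ids can then be fed to the model as extra sparse features.
+# duplicates='drop' guards against near-constant columns.
+discretized = data[dense_features].apply(
+    lambda col: pd.qcut(col, 10, labels=False, duplicates='drop'))
+```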
+
+![image](../pics/criteo_sample.png)
+
+In this example, we simply normalize the integer features to between 0 and 1; you
+can try other transformation techniques like log normalization or discretization, as sketched above.
+
+This example shows how to use ``DeepFM`` to solve a simple binary classification task. You can get the demo data [criteo_sample.txt](https://github.com/shenweichen/DeepCTR/tree/master/examples/criteo_sample.txt)
+and run the following code.
+
+```python
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder,MinMaxScaler
+from deepctr.models import DeepFM
+
+
+data = pd.read_csv('./criteo_sample.txt')
+
+sparse_features = ['C' + str(i) for i in range(1, 27)]
+dense_features = ['I'+str(i) for i in range(1,14)]
+
+data[sparse_features] = data[sparse_features].fillna('-1', )
+data[dense_features] = data[dense_features].fillna(0,)
+
+target = ['label']
+
+# 1.Label Encoding for sparse features,and do simple Transformation for dense features
+for feat in sparse_features:
+    lbe = LabelEncoder()
+    data[feat] = lbe.fit_transform(data[feat])
+mms = MinMaxScaler(feature_range=(0,1))
+data[dense_features] = mms.fit_transform(data[dense_features])
+
+# 2.count #unique features for each sparse field,and record dense feature field name
+
+sparse_feature_dict = {feat: data[feat].nunique() for feat in sparse_features}
+dense_feature_list = dense_features
+
+# 3.generate input data for model
+
+model_input = [data[feat].values for feat in sparse_feature_dict] + [data[feat].values for feat in dense_feature_list]
+
+# 4.Define Model,compile and train
+
+
+model = DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list}, final_activation='sigmoid')
+model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )
+history = model.fit(model_input, data[target].values,
+                    batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
+```
+
+## Regression: Movielens
+
+The MovieLens data has been used for personalized tag recommendation, which
+contains 668,953 tag applications of users on movies.
+Here is a small fraction of the data, including only sparse fields.
+
+![image](../pics/movielens_sample.png)
+
+
+This example shows how to use ``DeepFM`` to solve a simple regression task. You can get the demo data
+[movielens_sample.txt](https://github.com/shenweichen/DeepCTR/tree/master/examples/movielens_sample.txt) and run the following code.
+
+```python
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder,MinMaxScaler
+from deepctr.models import DeepFM
+
+
+data = pd.read_csv("./movielens_sample.txt")
+sparse_features = [ "movie_id","user_id","gender","age","occupation","zip"]
+target = ['rating']
+
+# 1.Label Encoding for sparse features,and do simple Transformation for dense features
+for feat in sparse_features:
+    lbe = LabelEncoder()
+    data[feat] = lbe.fit_transform(data[feat])
+#2.count #unique features for each sparse field
+sparse_feature_dim = {feat:data[feat].nunique() for feat in sparse_features}
+#3.generate input data for model
+model_input = [data[feat].values for feat in sparse_feature_dim]
+#4.Define Model,compile and train
+model = DeepFM({"sparse":sparse_feature_dim,"dense":[]},final_activation='linear')
+
+model.compile("adam","mse",metrics=['mse'],)
+history = model.fit(model_input,data[target].values,
+                    batch_size=256,epochs=10,verbose=2,validation_split=0.2,)
+```
+
+## Multi-value Input : Movielens
+
+The MovieLens data has been used for personalized tag recommendation, which
+contains 668,953 tag applications of users on movies.
+Here is a small fraction of the data, including sparse fields and a multivalent field.
+
+![image](../pics/movielens_sample_with_genres.png)
+
+There are two additional steps to use DeepCTR with sequence feature input.
+
+1. Generate the padded and encoded sequence feature and the valid length of the sequence feature.
+2. Generate the config of the sequence feature with `deepctr.utils.VarLenFeature` (a short sketch of this step follows below).
+
+``VarLenFeature`` is a namedtuple with signature ``VarLenFeature(name, dimension, maxlen, combiner)``
+
+- name : feature name, if it is already used in sparse_feature_dim, then a shared embedding mechanism will be used.
+- dimension : number of unique features
+- maxlen : maximum length of this feature for all samples
+- combiner : pooling method, can be ``sum``, ``mean`` or ``max``
+
+Now multi-value input is available for `AFM,AutoInt,DCN,DeepFM,FNN,NFM,PNN,xDeepFM`; for `DIN`, please read the example in [run_din.py](https://github.com/shenweichen/DeepCTR/blob/master/examples/run_din.py).
+This example shows how to use ``DeepFM`` with a sequence (multi-value) feature. You can get the demo data
+[movielens_sample.txt](https://github.com/shenweichen/DeepCTR/tree/master/examples/movielens_sample.txt). A minimal sketch of the ``VarLenFeature`` config comes first, then the full runnable example.
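+
+A short, isolated sketch of step 2 (the values are hypothetical, e.g. assuming the `genres` field has 18 unique keys and at most 6 genres per movie):
+
+```python
+from deepctr.utils import VarLenFeature
+
+# 'genres' does not reuse a key of the sparse feature dict here, so it gets
+# its own embedding table; reusing a name that already appears among the
+# sparse features would share that feature's embedding instead.
+genres_feature = VarLenFeature(name='genres', dimension=18, maxlen=6, combiner='mean')
+feature_dim_dict = {"sparse": {}, "dense": [], "sequence": [genres_feature]}
+```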
+
+```python
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+from deepctr.models import DeepFM
+from deepctr.utils import VarLenFeature
+
+
+def split(x):
+    key_ans = x.split('|')
+    for key in key_ans:
+        if key not in key2index:
+            key2index[key] = len(key2index)
+    return list(map(lambda x: key2index[x], key_ans))
+
+
+data = pd.read_csv("./movielens_sample.txt")
+sparse_features = ["movie_id", "user_id",
+                   "gender", "age", "occupation", "zip", ]
+target = ['rating']
+
+# 1.Label Encoding for sparse features,and process sequence features
+for feat in sparse_features:
+    lbe = LabelEncoder()
+    data[feat] = lbe.fit_transform(data[feat])
+# preprocess the sequence feature
+
+key2index = {}
+genres_list = list(map(split, data['genres'].values))
+genres_length = np.array(list(map(len, genres_list)))
+max_len = max(genres_length)
+genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post',)# Notice : padding='post'
+
+# 2.count #unique features for each sparse field and generate feature config for sequence feature
+
+sparse_feature_dim = {feat: data[feat].nunique() for feat in sparse_features}
+sequence_feature = [VarLenFeature('genres', len(key2index), max_len, 'mean')]
+
+# 3.generate input data for model
+sparse_input = [data[feat].values for feat in sparse_feature_dim]
+dense_input = []
+sequence_input = [genres_list]
+sequence_length_input = [genres_length]
+model_input = sparse_input + dense_input + sequence_input + \
+    sequence_length_input  # make sure the order is right
+
+# 4.Define Model,compile and train
+model = DeepFM({"sparse": sparse_feature_dim, "dense": [],
+                "sequence": sequence_feature}, final_activation='linear')
+
+model.compile("adam", "mse", metrics=['mse'],)
+history = model.fit(model_input, data[target].values,
+                    batch_size=256, epochs=10, verbose=2, validation_split=0.2,)
+```
\ No newline at end of file
diff --git a/docs/source/FAQ.md b/docs/source/FAQ.md
new file mode 100644
index 00000000..32e52ae3
--- /dev/null
+++ b/docs/source/FAQ.md
@@ -0,0 +1,66 @@
+# FAQ
+
+## 1. Save or load weights/models
+To save/load weights, you can write code just like for any other Keras model.
+
+```python
+model = DeepFM()
+model.save_weights('DeepFM_w.h5')
+model.load_weights('DeepFM_w.h5')
+```
+
+To save/load models, it is just a little different.
+
+```python
+from tensorflow.python.keras.models import save_model,load_model
+model = DeepFM()
+save_model(model, 'DeepFM.h5')# save_model, same as before
+
+from deepctr.utils import custom_objects
+model = load_model('DeepFM.h5',custom_objects)# load_model,just add a parameter
+```
+## 2. Set learning rate and use early stopping
+You can use any model in DeepCTR like a Keras model object.
+Here is an example of how to set the learning rate and use early stopping:
+
+```python
+import deepctr
+from tensorflow.python.keras.optimizers import Adam,Adagrad
+from tensorflow.python.keras.callbacks import EarlyStopping
+
+model = deepctr.models.DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list})
+model.compile(Adagrad(0.0808),'binary_crossentropy',metrics=['binary_crossentropy'])
+
+es = EarlyStopping(monitor='val_binary_crossentropy')
+history = model.fit(model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2,callbacks=[es] )
+```
+
+
+## 3. Get the attentional weights of feature interactions in AFM
+First, make sure that you have installed the latest version of deepctr.
+
+Then, use the following code; `attentional_weights[:,i,0]` is the attentional weight of `feature_interactions[i]` for all samples.
+
+```python
+import itertools
+import deepctr
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.layers import Lambda
+
+feature_dim_dict = {"sparse": sparse_feature_dict, "dense": dense_feature_list}
+model = deepctr.models.AFM(feature_dim_dict)
+model.fit(model_input,target)
+
+afmlayer = model.layers[-3]
+afm_weight_model = Model(model.input,outputs=Lambda(lambda x:afmlayer.normalized_att_score)(model.input))
+attentional_weights = afm_weight_model.predict(model_input,batch_size=4096)
+feature_interactions = list(itertools.combinations(list(feature_dim_dict['sparse'].keys()) + feature_dim_dict['dense'] ,2))
+```
+
+## 4. Do the models support multi-value input?
+Now multi-value input is available for `AFM,AutoInt,DCN,DeepFM,FNN,NFM,PNN,xDeepFM`; you can read the example [here](./Examples.html#multi-value-input-movielens).
+
+For `DIN`, please read the code example in [run_din.py](https://github.com/shenweichen/DeepCTR/blob/master/examples/run_din.py).
+
+You can use the layers in [sequence](./sequence.html) to build your own models!
+And it will be supported in a future release.
\ No newline at end of file
diff --git a/docs/source/FAQ.rst b/docs/source/FAQ.rst
deleted file mode 100644
index 4d00eb54..00000000
--- a/docs/source/FAQ.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-FAQ
-==========
-1. Save or load weights/models
-----------------------------------------
-To save/load weights,you can write codes just like any other keras models.
-
-.. code-block:: python
-
-    model = DeepFM()
-    model.save_weights('DeepFM_w.h5')
-    model.load_weights('DeepFM_w.h5')
-
-
-To save/load models,just a little different.
-
-.. code-block:: python
-
-    from tensorflow.python.keras.models import save_model,load_model
-    model = DeepFM()
-    save_model(model, 'DeepFM.h5')# save_model, same as before
-
-    from deepctr.utils import custom_objects
-    model = load_model('DeepFM.h5',custom_objects)# load_model,just add a parameter
-
-2. Set learning rate and use earlystopping
----------------------------------------------------
-You can use any models in DeepCTR like a keras model object.
-Here is a example of how to set learning rate and earlystopping:
-
-.. code-block:: python
-
-    import deepctr
-    from tensorflow.python.keras.optimizers import Adam,Adagrad
-    from tensorflow.python.keras.callbacks import EarlyStopping
-
-    model = deepctr.models.DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list})
-    model.compile(Adagrad('0.0808'),'binary_crossentropy',metrics=['binary_crossentropy'])
-
-    es = EarlyStopping(monitor='val_binary_crossentropy')
-    history = model.fit(model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2,callbacks=[es] )
-
-
-3. Get the attentional weights of feature interactions in AFM
---------------------------------------------------------------------------
-First,make sure that you have install the latest version of deepctr.
-
-Then,use the following code,the ``attentional_weights[:,i,0]`` is the ``feature_interactions[i]``'s attentional weight of all samples.
-
-.. 
+
+## 4. Do the models support multi-value input?
+---------------------------------------------------
+Multi-value input is now available for `AFM,AutoInt,DCN,DeepFM,FNN,NFM,PNN,xDeepFM`; you can read the example [here](./Examples.html#multi-value-input-movielens).
+
+For `DIN`, please read the code example in [run_din.py](https://github.com/shenweichen/DeepCTR/blob/master/examples/run_din.py).
+
+You can also use the layers in [sequence](./sequence.html) to build your own models! Unified multi-value support for `DIN` will be added in a future release.
\ No newline at end of file
diff --git a/docs/source/FAQ.rst b/docs/source/FAQ.rst
deleted file mode 100644
index 4d00eb54..00000000
--- a/docs/source/FAQ.rst
+++ /dev/null
@@ -1,70 +0,0 @@
-FAQ
-==========
-1. Save or load weights/models
-----------------------------------------
-To save/load weights,you can write codes just like any other keras models.
-
-.. code-block:: python
-
-    model = DeepFM()
-    model.save_weights('DeepFM_w.h5')
-    model.load_weights('DeepFM_w.h5')
-
-
-To save/load models,just a little different.
-
-.. code-block:: python
-
-    from tensorflow.python.keras.models import save_model,load_model
-    model = DeepFM()
-    save_model(model, 'DeepFM.h5')# save_model, same as before
-
-    from deepctr.utils import custom_objects
-    model = load_model('DeepFM.h5',custom_objects)# load_model,just add a parameter
-
-2. Set learning rate and use earlystopping
----------------------------------------------------
-You can use any models in DeepCTR like a keras model object.
-Here is a example of how to set learning rate and earlystopping:
-
-.. code-block:: python
-
-    import deepctr
-    from tensorflow.python.keras.optimizers import Adam,Adagrad
-    from tensorflow.python.keras.callbacks import EarlyStopping
-
-    model = deepctr.models.DeepFM({"sparse": sparse_feature_dict, "dense": dense_feature_list})
-    model.compile(Adagrad('0.0808'),'binary_crossentropy',metrics=['binary_crossentropy'])
-
-    es = EarlyStopping(monitor='val_binary_crossentropy')
-    history = model.fit(model_input, data[target].values,batch_size=256, epochs=10, verbose=2, validation_split=0.2,callbacks=[es] )
-
-
-3. Get the attentional weights of feature interactions in AFM
--------------------------------------------------------------------------- 
-First,make sure that you have install the latest version of deepctr.
-
-Then,use the following code,the ``attentional_weights[:,i,0]`` is the ``feature_interactions[i]``'s attentional weight of all samples.
-
-.. code-block:: python
-
-    import itertools
-    import deepctr
-    from tensorflow.python.keras.models import Model
-    from tensorflow.python.keras.layers import Lambda
-
-    feature_dim_dict = {"sparse": sparse_feature_dict, "dense": dense_feature_list}
-    model = deepctr.models.AFM(feature_dim_dict)
-    model.fit(model_input,target)
-
-    afmlayer = model.layers[-3]
-    afm_weight_model = Model(model.input,outputs=Lambda(lambda x:afmlayer.normalized_att_score)(model.input))
-    attentional_weights = afm_weight_model.predict(model_input,batch_size=4096)
-    feature_interactions = list(itertools.combinations(list(feature_dim_dict['sparse'].keys()) + feature_dim_dict['dense'] ,2))
-
-
-
-4. Does the models support multi-value input?
----------------------------------------------------
-Now only the `DIN `_ model support multi-value input,you can use layers in `sequence `_ to build your own models!
-And it will be supported in a future release
\ No newline at end of file
diff --git a/docs/source/History.md b/docs/source/History.md
index 152b0d54..6c601a5b 100644
--- a/docs/source/History.md
+++ b/docs/source/History.md
@@ -1,4 +1,5 @@
 # History
+- 01/01/2019 : [v0.2.2](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.2) released.Add [sequence(multi-value) input support](./Examples.html#multi-value-input-movielens) for `AFM,AutoInt,DCN,DeepFM,FNN,NFM,PNN,xDeepFM` models.
 - 12/27/2018 : [v0.2.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.1) released.Add [AutoInt](./Features.html#autoint-automatic-feature-interactiont) Model.
 - 12/22/2018 : [v0.2.0](https://github.com/shenweichen/DeepCTR/releases/tag/v0.2.0) released.Add [xDeepFM](./Features.html#xdeepfm) and automatic check for new version.
 - 12/19/2018 : [v0.1.6](https://github.com/shenweichen/DeepCTR/releases/tag/v0.1.6) released.Now DeepCTR is compatible with tensorflow from `1.4-1.12` except for `1.7` and `1.8`.
diff --git a/docs/source/Quick-Start.rst b/docs/source/Quick-Start.rst
index a83983e1..7ce7427e 100644
--- a/docs/source/Quick-Start.rst
+++ b/docs/source/Quick-Start.rst
@@ -88,7 +88,7 @@ There are two rules here that we must follow
                         batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
 
 
-You can check the full code `here <./Demo.html>`_
+You can check the full code `here <./Examples.html#classification-criteo>`_
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 844ce894..dd3a361d 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.2.1'
+release = '0.2.2'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index a8d51c17..385a92de 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -35,22 +35,20 @@ You can read the latest code at https://github.com/shenweichen/DeepCTR
 News
 -----
+01/01/2019 : Add `sequence(multi-value) input support <./Examples.html#multi-value-input-movielens>`_ for ``AFM,AutoInt,DCN,DeepFM,FNN,NFM,PNN,xDeepFM`` models. `Changelog `_
+
 12/27/2018 : Add `AutoInt <./Features.html#autoint-automatic-feature-interaction>`_ . `Changelog `_
 
 12/22/2018 : Add `xDeepFM <./Features.html#xdeepfm>`_ and automatic check for new version. `Changelog `_
 
-12/19/2018 : DeepCTR is compatible with tensorflow from ``1.4-1.12`` except for ``1.7`` and ``1.8``. `Changelog `_
-
-11/24/2018 : DeepCTR is released! `PyPi `_.
-
 .. toctree::
    :maxdepth: 2
    :caption: Home:
 
    Quick-Start
    Features
-   Demo
-   FAQ
+   Examples
+   FAQ
    History
 
 .. toctree::
diff --git a/examples/run_multivalue_movielens.py b/examples/run_multivalue_movielens.py
new file mode 100644
index 00000000..a01af77f
--- /dev/null
+++ b/examples/run_multivalue_movielens.py
@@ -0,0 +1,52 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+from deepctr.models import DeepFM
+from deepctr.utils import VarLenFeature
+
+
+def split(x):
+    key_ans = x.split('|')
+    for key in key_ans:
+        if key not in key2index:
+            key2index[key] = len(key2index)
+    return list(map(lambda x: key2index[x], key_ans))
+
+
+data = pd.read_csv("./movielens_sample.txt")
+sparse_features = ["movie_id", "user_id",
+                   "gender", "age", "occupation", "zip", ]
+target = ['rating']
+
+# 1. Label-encode the sparse features and preprocess the sequence feature
+for feat in sparse_features:
+    lbe = LabelEncoder()
+    data[feat] = lbe.fit_transform(data[feat])
+
+key2index = {}
+genres_list = list(map(split, data['genres'].values))
+genres_length = np.array(list(map(len, genres_list)))
+max_len = max(genres_length)
+genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post',)
+
+# 2. Count #unique features for each sparse field and generate the feature config for the sequence feature
+sparse_feature_dim = {feat: data[feat].nunique() for feat in sparse_features}
+sequence_feature = [VarLenFeature('genres', len(key2index), max_len, 'mean')]
+
+# 3. Generate input data for the model
+sparse_input = [data[feat].values for feat in sparse_feature_dim]
+dense_input = []
+sequence_input = [genres_list]
+sequence_length_input = [genres_length]
+model_input = sparse_input + dense_input + sequence_input + \
+    sequence_length_input  # make sure the order is right
+
+# 4. Define the model, compile and train
+model = DeepFM({"sparse": sparse_feature_dim, "dense": [],
+                "sequence": sequence_feature}, final_activation='linear')
+
+model.compile("adam", "mse", metrics=['mse'],)
+history = model.fit(model_input, data[target].values,
+                    batch_size=256, epochs=10, verbose=2, validation_split=0.2,)
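+
+# Optional follow-up (not part of the original example, added as a sketch):
+# the trained model can be queried with the same, identically ordered, input list.
+pred_ans = model.predict(model_input, batch_size=256)
+print("train MSE:", np.mean((pred_ans[:, 0] - data[target].values[:, 0]) ** 2))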
diff --git a/setup.py b/setup.py
index 012d6adc..7fe7f9e7 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 setuptools.setup(
     name="deepctr",
-    version="0.2.1",
+    version="0.2.2",
     author="Weichen Shen",
     author_email="wcshen1994@163.com",
     description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with tensorflow.",
diff --git a/tests/models/AFM_test.py b/tests/models/AFM_test.py
index 17a596eb..4d42508f 100644
--- a/tests/models/AFM_test.py
+++ b/tests/models/AFM_test.py
@@ -1,37 +1,22 @@
-import numpy as np
 import pytest
 from deepctr.models import AFM
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
-    'use_attention,sparse_feature_num',
-    [(True, 1), (False, 3)
+    'use_attention,sparse_feature_num,dense_feature_num',
+    [(True, 1, 1), (False, 3, 3),
      ]
 )
-def test_AFM(use_attention, sparse_feature_num):
+def test_AFM(use_attention, sparse_feature_num, dense_feature_num):
     model_name = "AFM"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, dense_feature_num)
 
     model = AFM(feature_dim_dict, use_attention=use_attention, keep_prob=0.5,)
     check_model(model, model_name, x, y)
 
 
 if __name__ == "__main__":
-    test_AFM(use_attention=True, sparse_feature_num=2)
+    test_AFM(use_attention=True, sparse_feature_num=2, dense_feature_num=2)
diff --git a/tests/models/AutoInt_test.py b/tests/models/AutoInt_test.py
index 260a9ddc..2abd5270 100644
--- a/tests/models/AutoInt_test.py
+++ b/tests/models/AutoInt_test.py
@@ -1,7 +1,6 @@
-import numpy as np
 import pytest
 from deepctr.models import AutoInt
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
@@ -11,22 +10,8 @@
 def test_AutoInt(att_layer_num, hidden_size, sparse_feature_num):
     model_name = "AutoInt"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, sparse_feature_num)
 
     model = AutoInt(feature_dim_dict, att_layer_num=att_layer_num,
                     hidden_size=hidden_size, keep_prob=0.5, )
@@ -34,4 +19,4 @@
 
 
 if __name__ == "__main__":
-    test_AutoInt(True, (32, 32), 2)
+    test_AutoInt(2, (32, 32), 2)
diff --git a/tests/models/DCN_test.py b/tests/models/DCN_test.py
index 9ed21a1a..822d341f 100644
--- a/tests/models/DCN_test.py
+++ b/tests/models/DCN_test.py
@@ -1,34 +1,19 @@
-import numpy as np
 import pytest
 from deepctr.models import DCN
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
     'embedding_size,cross_num,hidden_size,sparse_feature_num',
-    [(8, 0, (32,), 2), (8, 1, (), 1), ('auto', 1, (32,), 3)
+    [(8, 0, (32,), 2), ('auto', 1, (), 1), ('auto', 1, (32,), 3)
      ]
 )
 def test_DCN(embedding_size, cross_num, hidden_size, sparse_feature_num):
     model_name = "DCN"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, sparse_feature_num)
 
     model = DCN(feature_dim_dict, embedding_size=embedding_size,
                 cross_num=cross_num, hidden_size=hidden_size, keep_prob=0.5, )
diff --git a/tests/models/DeepFM_test.py b/tests/models/DeepFM_test.py
index 4437ffbb..54b7a7fc 100644
--- a/tests/models/DeepFM_test.py
+++ b/tests/models/DeepFM_test.py
@@ -1,7 +1,6 @@
-import numpy as np
 import pytest
 from deepctr.models import DeepFM
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
@@ -12,22 +11,8 @@
 def test_DeepFM(use_fm, hidden_size, sparse_feature_num):
     model_name = "DeepFM"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, sparse_feature_num)
 
     model = DeepFM(feature_dim_dict, use_fm=use_fm,
                    hidden_size=hidden_size, keep_prob=0.5, )
diff --git a/tests/models/FNN_test.py b/tests/models/FNN_test.py
index 7c51a656..0af5aa25 100644
--- a/tests/models/FNN_test.py
+++ b/tests/models/FNN_test.py
@@ -1,39 +1,40 @@
-import numpy as np
 import pytest
 from deepctr.models import FNN
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
-    'sparse_feature_num',
-    [1, 3
+    'sparse_feature_num,dense_feature_num',
+    [(1, 1), (3, 3)
      ]
 )
-def test_FNN(sparse_feature_num):
+def test_FNN(sparse_feature_num, dense_feature_num):
     model_name = "FNN"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, dense_feature_num)
+
+    model = FNN(feature_dim_dict, hidden_size=[32, 32], keep_prob=0.5, )
+    check_model(model, model_name, x, y)
+
+
+@pytest.mark.parametrize(
+    'sparse_feature_num,dense_feature_num',
+    [(0, 1), (1, 0)
+     ]
+)
+def test_FNN_without_seq(sparse_feature_num, dense_feature_num):
+    model_name = "FNN"
+
+    sample_size = 64
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, dense_feature_num, sequence_feature=())
 
     model = FNN(feature_dim_dict, hidden_size=[32, 32], keep_prob=0.5, )
     check_model(model, model_name, x, y)
 
 
 if __name__ == "__main__":
-    test_FNN(2)
+    test_FNN(2, 2)
diff --git a/tests/models/NFM_test.py b/tests/models/NFM_test.py
index b612e140..7e88898f 100644
--- a/tests/models/NFM_test.py
+++ b/tests/models/NFM_test.py
@@ -1,7 +1,6 @@
-import numpy as np
 import pytest
 from deepctr.models import NFM
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
@@ -13,21 +12,8 @@
 def test_NFM(hidden_size, sparse_feature_num):
     model_name = "NFM"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, sparse_feature_num)
 
     model = NFM(feature_dim_dict, embedding_size=8,
                 hidden_size=[32, 32], keep_prob=0.5, )
diff --git a/tests/models/PNN_test.py b/tests/models/PNN_test.py
index e6ae159a..1a9816a8 100644
--- a/tests/models/PNN_test.py
+++ b/tests/models/PNN_test.py
@@ -1,8 +1,7 @@
-import numpy as np
 import pytest
 
 from deepctr.models import PNN
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
@@ -13,22 +12,8 @@
 def test_PNN(use_inner, use_outter, sparse_feature_num):
     model_name = "PNN"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, sparse_feature_num]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
-
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, sparse_feature_num)
     model = PNN(feature_dim_dict, embedding_size=8, hidden_size=[32, 32],
                 keep_prob=0.5, use_inner=use_inner, use_outter=use_outter)
     check_model(model, model_name, x, y)
diff --git a/tests/models/xDeepFM_test.py b/tests/models/xDeepFM_test.py
index bd521fa6..1f8a4d47 100644
--- a/tests/models/xDeepFM_test.py
+++ b/tests/models/xDeepFM_test.py
@@ -1,7 +1,6 @@
-import numpy as np
 import pytest
 from deepctr.models import xDeepFM
-from ..utils import check_model
+from ..utils import check_model, get_test_data
 
 
 @pytest.mark.parametrize(
@@ -13,22 +12,8 @@
 def test_xDeepFM(hidden_size, cin_layer_size, cin_split_half, cin_activation, sparse_feature_num, dense_feature_dim):
     model_name = "xDeepFM"
 
     sample_size = 64
-    feature_dim_dict = {"sparse": {}, 'dense': []}
-    for name, num in zip(["sparse", "dense"], [sparse_feature_num, dense_feature_dim]):
-        if name == "sparse":
-            for i in range(num):
-                feature_dim_dict[name][name + '_' +
-                                       str(i)] = np.random.randint(1, 10)
-        else:
-            for i in range(num):
-                feature_dim_dict[name].append(name + '_' + str(i))
-    sparse_input = [np.random.randint(0, dim, sample_size)
-                    for dim in feature_dim_dict['sparse'].values()]
-    dense_input = [np.random.random(sample_size)
-                   for name in feature_dim_dict['dense']]
-
-    y = np.random.randint(0, 2, sample_size)
-    x = sparse_input + dense_input
+    x, y, feature_dim_dict = get_test_data(
+        sample_size, sparse_feature_num, dense_feature_dim)
 
     model = xDeepFM(feature_dim_dict, hidden_size=hidden_size,
                     cin_layer_size=cin_layer_size, cin_split_half=cin_split_half,
                     cin_activation=cin_activation, keep_prob=0.5, )
diff --git a/tests/utils.py b/tests/utils.py
index 82d5f398..e21fef95 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -6,57 +6,48 @@
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.layers import Input
 from tensorflow.python.keras.models import Model, save_model, load_model
-from deepctr.utils import custom_objects
-
-
-def get_test_data(num_train=1000, num_test=500, input_shape=(10,),
-
-                  output_shape=(2,),
-
-                  classification=True, num_classes=2):
-    """Generates test data to train a model on.
-
-
-
-    classification=True overrides output_shape
-
-    (i.e. output_shape is set to (1,)) and the output
-
-    consists in integers in [0, num_classes-1].
-
-
-
-    Otherwise: float output with shape output_shape.
-
-    """
-
-    samples = num_train + num_test
-
+from deepctr.utils import custom_objects, VarLenFeature
+
+
+def gen_sequence(dim, max_len, sample_size):
+    return np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)]), np.random.randint(1, max_len + 1, sample_size)
+
+
+def get_test_data(sample_size=1000, sparse_feature_num=1, dense_feature_num=1, sequence_feature=('max', 'mean', 'sum'),
+                  classification=True,):
+
+    feature_dim_dict = {"sparse": {}, 'dense': [], 'sequence': []}
+
+    for i in range(sparse_feature_num):
+        dim = np.random.randint(1, 10)
+        feature_dim_dict['sparse']['sparse_'+str(i)] = dim
+    for i in range(dense_feature_num):
+        feature_dim_dict['dense'].append('dense_'+str(i))
+    for i, mode in enumerate(sequence_feature):
+        dim = np.random.randint(1, 10)
+        maxlen = np.random.randint(1, 10)
+        feature_dim_dict['sequence'].append(
+            VarLenFeature('sequence_'+str(i), dim, maxlen, mode))
+
+    sparse_input = [np.random.randint(0, dim, sample_size)
+                    for dim in feature_dim_dict['sparse'].values()]
+    dense_input = [np.random.random(sample_size)
+                   for name in feature_dim_dict['dense']]
+    sequence_input = []
+    sequence_len_input = []
+    for var in feature_dim_dict['sequence']:
+        s_input, s_len_input = gen_sequence(
+            var.dimension, var.maxlen, sample_size)
+        sequence_input.append(s_input)
+        sequence_len_input.append(s_len_input)
     if classification:
-
-        y = np.random.randint(0, num_classes, size=(samples,))
-
-        X = np.zeros((samples,) + input_shape, dtype=np.float32)
-
-        for i in range(samples):
-
-            X[i] = np.random.normal(loc=y[i], scale=0.7, size=input_shape)
-
+        y = np.random.randint(0, 2, sample_size)
     else:
+        y = np.random.random(sample_size)
 
-        y_loc = np.random.random((samples,))
-
-        X = np.zeros((samples,) + input_shape, dtype=np.float32)
-
-        y = np.zeros((samples,) + output_shape, dtype=np.float32)
+    x = sparse_input + dense_input + sequence_input + sequence_len_input
 
-        for i in range(samples):
-
-            X[i] = np.random.normal(loc=y_loc[i], scale=0.7, size=input_shape)
-
-            y[i] = np.random.normal(loc=y_loc[i], scale=0.7, size=output_shape)
-
-    return (X[:num_train], y[:num_train]), (X[num_train:], y[num_train:])
+    return x, y, feature_dim_dict
 
 
 def layer_test(layer_cls, kwargs={}, input_shape=None, input_dtype=None,
@@ -64,12 +55,6 @@
                input_data=None, expected_output=None, expected_output_dtype=None,
                fixed_batch_size=False):
-    """Test routine for a layer with a single input tensor
-
-    and single output tensor.
-
-    """
-
     # generate input data
     if input_data is None: