diff --git a/Makefile b/Makefile index 10af3fa..ab673bc 100644 --- a/Makefile +++ b/Makefile @@ -10,10 +10,10 @@ gklib_path = not-set bcls_path = not-set shared = not-set with_mkl = not-set -cc = /usr/bin/gcc -cxx = /usr/bin/g++ -#cc = gcc-mp-4.9 -#cxx = g++-mp-4.9 +cc = not-set +cxx = not-set +#cc = /usr/bin/gcc +#cxx = /usr/bin/g++ #=============================================================== # There should be no need to modify beyond this point diff --git a/include/slim.h b/include/slim.h index 6a39230..6fba4f1 100644 --- a/include/slim.h +++ b/include/slim.h @@ -50,7 +50,7 @@ typedef void slim_t; * Constant definitions *-------------------------------------------------------------------------*/ /* SLIM's version number */ -#define SLIM_VERSION "2.0pre1" +#define SLIM_VERSION "2.0" /* The maximum length of the options[] array */ #define SLIM_NOPTIONS 40 diff --git a/python-package/SLIM/core.py b/python-package/SLIM/core.py index 772fde5..a452c79 100755 --- a/python-package/SLIM/core.py +++ b/python-package/SLIM/core.py @@ -6,6 +6,7 @@ @author: dminerx007 """ +import os import site import time import scipy @@ -516,12 +517,15 @@ def mselect(self, params, trndata, tstdata, arrayl1, arrayl2, nrcmds): raise RuntimeError( 'Something went wrong with model estimation or evaluation when l1=%.4f, l2=%.4f. Please check the input matrix.' 
% (bestl1HR, bestl2HR)) - def predict(self, data, nrcmds=10, outfile=None): + def predict(self, data, nrcmds=10, outfile=None, negitems=None, nnegs=0, returnscores=False): ''' @brief predict using the learned SLIM model - @params data: a SLIMatrix object to be predicted - nrcmds: number of recommended items for each user - outfile: a filename to dump the topn lists - @return an numpy ndarray of shape (nUsers, nrcmds) + @params data: a SLIMatrix object to be predicted + nrcmds: number of recommended items for each user + outfile: a filename to dump the topn lists + negitems: optional dict mapping each user to a sequence of nnegs negative items + nnegs: number of negative items per user + @return out: a dict mapping each user to an array of the nrcmds recommended item ids + outscores: (only returned when returnscores is True) a dict mapping each user to the scores of the corresponding items ''' if self.ismodel != SLIM_OK: raise TypeError("Model not found. Please train a model.") @@ -531,31 +535,89 @@ def predict(self, data, nrcmds=10, outfile=None): # initialize the result matrix res = np.full(data.nUsers * nrcmds, -1, dtype=np.int32) - - rstatus = self._slim_predict( - nrcmds, - self.handle, - data.handle, - res) - + scores = np.zeros(data.nUsers * nrcmds, dtype=np.float32) + + if negitems != None: + assert nnegs >= nrcmds, \ + 'The number of negative items must be larger than the number of items to be recommended.' + + if isinstance(data.user2id, dict): + assert data.user2id.keys() == negitems.keys(), \ + 'The users in the negative items should be the same with the input matrix.' + else: + assert np.array_equal(data.user2id, np.array(sorted(list(negitems.keys())))), \ + 'The users in the negative items should be the same with the input matrix.' + + slim_negitems = np.full(data.nUsers * nnegs, -1, dtype=np.int32) + + newitems = 0 + for key, value in negitems.items(): + assert len(value) == nnegs, \ + 'The number of negative items should match nngs.' 
+ for i in range(nnegs): + try: + slim_negitems[data.user2id[key] * nnegs + i] = self.item2id[value[i]] + except: + newitems += 1 + + if newitems > 0: + print('%d negative items not in the training set.' % (newitems)) + + rstatus = self._slim_predict_1vsk( + nrcmds, + nnegs, + self.handle, + data.handle, + slim_negitems, + res, + scores) + + else: + rstatus = self._slim_predict( + nrcmds, + self.handle, + data.handle, + res, + scores) + if rstatus == SLIM_OK: res = self.id2item[res].reshape(data.nUsers, nrcmds) - out = {} - for key, value in data.user2id.items(): - out[key] = res[value, :] + scores = scores.reshape(data.nUsers, nrcmds) + out = dict() + outscores = dict() + + if isinstance(data.user2id, dict): + for key, value in data.user2id.items(): + out[key] = res[value, :] + outscores[key] = scores[value, :] + else: + for key in data.user2id: + out[key] = res[key, :] + outscores[key] = scores[key, :] if outfile: f = open(outfile, 'w') for key, value in out.items(): f.write(str(key) + ': ' + np.array2string(value, max_line_width=np.inf) + '\n') + if returnscores: + f.write(str(key) + ': ' + np.array2string(outscores[key], + max_line_width=np.inf) + '\n') else: raise RuntimeError( 'Something went wrong during prediction. 
Please check 1) if the model is estimated correctly; 2) if the input matrix for prediction is correct.') - - return out + + if returnscores: + return out, outscores + else: + return out def save_model(self, modelfname, mapfname): + ''' @brief save the model + @params modelfname: filename to save the model + mapfname: filename to save the item map + @return None + ''' # save the model if there is one if self.ismodel == SLIM_OK: self._slim_save(self.handle, c_char_p(modelfname.encode('utf-8'))) @@ -564,26 +626,62 @@ def save_model(self, modelfname, mapfname): raise RuntimeError("Not exist a model to save.") def load_model(self, modelfname, mapfname): + ''' @brief load a model + @params modelfname: filename of the model + mapfname: filename of the item map + @return None + ''' # if there is a model, destruct the model + if os.path.isfile(modelfname) and os.path.isfile(mapfname): + if self.ismodel == SLIM_OK: + self._slim_free(self.handle) + else: + self.handle = c_void_p() + self.ismodel = self._slim_load( + byref(self.handle), c_char_p(modelfname.encode('utf-8'))) + + try: + self.id2item = np.genfromtxt(mapfname, dtype=np.int32) + except: + self.id2item = np.genfromtxt(mapfname) + self.item2id = {} + for i in range(len(self.id2item)): + self.item2id[self.id2item[i]] = i + self.nItems = len(self.id2item) + + if self.ismodel != SLIM_OK: + raise RuntimeError("Fail to laod the model.") + else: + raise RuntimeError('File does not exist or invalid filename.') + + def to_csr(self, returnmap=False): + ''' @brief export the model as a scipy csr + @params returnmap: return the map or not + @return modelcsr: the model as a scipy csr + itemmap (optional): the item map attached with the model + ''' if self.ismodel == SLIM_OK: - self._slim_free(self.handle) + nnz = c_int(0) + self._slim_stat(self.handle, byref(nnz)) + + indptr = np.zeros(self.nItems + 1, dtype=np.int32) + indices = np.zeros(nnz.value, dtype=np.int32) + data = np.ones(nnz.value, dtype=np.float32) + + 
+ self._slim_export(self.handle, indptr, indices, data) + + modelcsr = csr_matrix((data, indices, indptr), shape=(self.nItems, self.nItems)) + + if returnmap: + itemmap = self.id2item[:] + return modelcsr, itemmap + else: + return modelcsr else: - self.handle = c_void_p() - self.ismodel = self._slim_load( - byref(self.handle), c_char_p(modelfname.encode('utf-8'))) - - try: - self.id2item = np.genfromtxt(mapfname, dtype=np.int32) - except: - self.id2item = np.genfromtxt(mapfname) - self.item2id = {} - for i in range(len(self.id2item)): - self.item2id[self.id2item[i]] = i - self.nItems = len(self.id2item) - - if self.ismodel != SLIM_OK: - raise RuntimeError("Fail to laod the model.") - + raise RuntimeError("Not exist a model to export.") + + + def _get_slim(self): ''' @brief wrap up slim functions from c library for python @params None @@ -634,10 +732,26 @@ def _get_slim(self): argtypes=[c_int, # nrcmds c_void_p, # slimhandle c_void_p, # trnhandle - array_1d_int32_t # output + array_1d_int32_t, # output + array_1d_float32_t # scores ] ) - + + # access Py_SLIM_Predict_1vsk from libslim.so + self._slim_predict_1vsk = wrap_function( + slimlib, + "Py_SLIM_Predict_1vsk", + restype=c_int32, # resmat + argtypes=[c_int, # nrcmds + c_int, # nnegs + c_void_p, # slimhandle + c_void_p, # trnhandle + array_1d_int32_t, # negitems + array_1d_int32_t, # output + array_1d_float32_t # scores + ] + ) + # access Py_csr_save from libslim.so self._slim_save = wrap_function( slimlib, @@ -666,3 +780,25 @@ def _get_slim(self): argtypes=[c_void_p # mathandle ] ) + + # access Py_csr_stat from libslim.so + self._slim_stat = wrap_function( + slimlib, + "Py_csr_stat", + restype=c_int32, # flag + argtypes=[c_void_p, # mathandle + c_void_p # nnz + ] + ) + + # access Py_csr_export from libslim.so + self._slim_export = wrap_function( + slimlib, + "Py_csr_export", + restype=c_int32, # flag + argtypes=[c_void_p, # mathandle + array_1d_int32_t, # indptr + array_1d_int32_t, # indices + array_1d_float32_t # 
data + ] + ) diff --git a/src/libslim/pyapi.c b/src/libslim/pyapi.c index fbe8a21..1b35733 100644 --- a/src/libslim/pyapi.c +++ b/src/libslim/pyapi.c @@ -75,6 +75,53 @@ int32_t Py_csr_free(slim_t *mathandle) { return SLIM_OK; } +/**************************************************************************/ +/*! @brief get the statistics (nnz) of the csr model + @param mathandle handle to the matrix + nnz number of non-zeros in the model + @return a flag indicating whether the function succeed +*/ +/**************************************************************************/ +int32_t Py_csr_stat(slim_t *mathandle, int32_t *nnz) { + gk_csr_t *mat = (gk_csr_t *)mathandle; + *nnz = mat->rowptr[mat->nrows]; + return SLIM_OK; +} + + +/**************************************************************************/ +/*! @brief export the gk_csr matrix to a scipy csr matrix + @param mathandle handle to the matrix + indptr index pointer of the scipy csr matrix + indices index of the scipy csr matrix + data data of the scipy csr matrix + @return a flag indicating whether the function succeed +*/ +/**************************************************************************/ +int32_t Py_csr_export(slim_t *mathandle, int32_t *indptr, int32_t *indices, float *data) { + int32_t nrows, nnz; + + gk_csr_t *mat = (gk_csr_t *)mathandle; + nrows = mat->nrows; + nnz = mat->rowptr[mat->nrows]; + + for (int i = 0; i < nrows + 1; i++) { + indptr[i] = mat->rowptr[i]; + } + + for (int i = 0; i < nnz; i++) { + indices[i] = mat->rowind[i]; + } + + if (mat->rowval) { + for (int i = 0; i < nnz; i++) { + data[i] = mat->rowval[i]; + } + } + + return SLIM_OK; +} + /**************************************************************************/ /*! 
@brief estimate a slim model and return the model handle to python @param trnhandle handle to the training matrix diff --git a/src/programs/cmdline_learn.c b/src/programs/cmdline_learn.c index 6199b5e..c2f9ec6 100644 --- a/src/programs/cmdline_learn.c +++ b/src/programs/cmdline_learn.c @@ -73,24 +73,18 @@ static char helpstr[][512] = { " csrnv - CSR format without ratings.", " cluto - Format used by CLUTO.", " ijv - One (row#, col#, val) per line.", - " " + " ", " -binarize", " Specifies that the ratings should be binarized.", " ", " -l1r=double", - " Specifies the L1 regularization parameter. The default value is " - "1.0.", - " ", - " -ipmdlfile=string", - " Specifies the file used to initialize the model.", + " Specifies the L1 regularization parameter. The default value is 1.0.", " ", " -l2r=double", - " Specifies the L2 regularization parameter. The default value is " - "1.0.", + " Specifies the L2 regularization parameter. The default value is 1.0.", " ", " -nnbrs=int", - " Selects FSLIM model and specifies the number of item nearest " - "neighbors", + " Selects FSLIM model and specifies the number of item nearest neighbors", " to be used. The default value is 0.", " ", " -simtype=string", @@ -116,12 +110,13 @@ static char helpstr[][512] = { " ", " -nthreads=int", " Specifies the number of threads to be used for estimation.", - " The default value is maximum number of threads available in the " - "machine.", + " The default value is maximum number of threads available in the machine.", + " ", + " -ipmdlfile=string", + " Specifies the file used to initialize the model.", " ", " -dbglvl=int", - " Specifies the debug level. The default value turns on info and " - "timing.", + " Specifies the debug level. 
The default value turns on info and timing.", " ", " -help", " Prints this message.", diff --git a/src/programs/cmdline_predict.c b/src/programs/cmdline_predict.c index 5e1535f..3d90f96 100644 --- a/src/programs/cmdline_predict.c +++ b/src/programs/cmdline_predict.c @@ -36,17 +36,27 @@ static gk_StringMap_t ifmt_options[] = { static char helpstr[][512] = { " ", " Usage:", - " slim_predict [options] model-file old-file [test-file]", + " slim_predict [options] model-file old-file [test-file] [neg-file]", " ", " Parameters:", " model-file", " The file that stores the model that was generated by slim_learn.", " ", " old-file", - " The file that stores the historical information for each user.", + " The file that stores the historical information for the users", + " for which recommendations are generated.", " ", " test-file", " The file that stores the hidden items for each user.", + " It is only used to evaluate the quality of the recommendations", + " and it should contain a row for each of the users in the old-file.", + " ", + " neg-file", + " The file that stores the negative items for each user.", + " It is used for evaluation purposes as follows: The hidden items", + " and the negative items are predicted, and the nrcmds highest", + " highest scoring items among them are returned as the recommendations.", + " This is list is then used to evaluate the performance.", " ", " Options:", " -ifmt=string", @@ -55,7 +65,7 @@ static char helpstr[][512] = { " csrnv - CSR format without ratings.", " cluto - Format used by CLUTO.", " ijv - One (row#, col#, val) per line.", - " " + " ", " -binarize", " Specifies that the ratings should be binarized.", " ", @@ -98,6 +108,7 @@ params_t *parse_cmdline(int argc, char *argv[]) { params->binarize = 0; params->outfile = NULL; params->tstfile = NULL; + params->negfile = NULL; params->nrcmds = 10; params->dbglvl = 0; @@ -145,7 +156,7 @@ params_t *parse_cmdline(int argc, char *argv[]) { } /* get the datafile */ - if (argc - gk_optind < 
1 || argc - gk_optind > 3) { + if (argc - gk_optind < 1 || argc - gk_optind > 4) { for (int i = 0; strlen(shorthelpstr[i]) > 0; i++) printf("%s\n", shorthelpstr[i]); exit(0); @@ -159,11 +170,17 @@ params_t *parse_cmdline(int argc, char *argv[]) { if (!gk_fexists(params->trnfile)) errexit("Input old file %s does not exist.\n", params->trnfile); - if (argc - gk_optind == 1) { + if (argc - gk_optind >= 1) { params->tstfile = gk_strdup(argv[gk_optind++]); if (!gk_fexists(params->tstfile)) errexit("Input test file %s does not exist.\n", params->tstfile); } + if (argc - gk_optind >= 1) { + params->negfile = gk_strdup(argv[gk_optind++]); + if (!gk_fexists(params->negfile)) + errexit("Input negative file %s does not exist.\n", params->negfile); + } + return params; } diff --git a/src/programs/slim_learn.c b/src/programs/slim_learn.c index 1c87744..3d886a7 100644 --- a/src/programs/slim_learn.c +++ b/src/programs/slim_learn.c @@ -33,12 +33,14 @@ int main(int argc, char *argv[]) { "------------------------------------------------------------------\n"); printf(" trnfile: %s, nrows: %d, ncols: %d, nnz: %zd\n", params->trnfile, tmat->nrows, tmat->ncols, tmat->rowptr[tmat->nrows]); - printf(" l1r: %.2le, l2r: %.2le, optTol: %.2le, niters: %d\n", params->l1r, - params->l2r, params->optTol, params->niters); - printf(" binarize: %d, nnbrs: %d, nthreads: %d, dbglvl: %d\n", - params->binarize, params->nnbrs, params->nthreads, params->dbglvl); - printf(" simtype: %s, mdlfile: %s\n", slim_simtypenames[params->simtype], - params->mdlfile); + printf(" l1r: %.2le, l2r: %.2le, binarize: %s\n", params->l1r, + params->l2r, (params->binarize == 0 ? 
"No" : "Yes")); + printf(" solver: %s, optTol: %.2le, niters: %d\n", + slim_algonames[params->algo], params->optTol, params->niters); + printf(" mdlfile: %s, nthreads: %d, dbglvl: %d\n", + params->mdlfile, params->nthreads, params->dbglvl); + printf(" simtype: %s, nnbrs: %d\n", + slim_simtypenames[params->simtype], params->nnbrs); printf("\nEstimating model...\n"); /* free any user-supplied ratings if set to be ignored */ diff --git a/src/programs/slim_predict.c b/src/programs/slim_predict.c index 3108ae5..889344a 100644 --- a/src/programs/slim_predict.c +++ b/src/programs/slim_predict.c @@ -14,14 +14,15 @@ /*************************************************************************/ int main(int argc, char *argv[]) { ssize_t zI; - int32_t iU, iR, nrcmds, nhits[3], ntrue[2]; + int32_t i, iU, iR, nrcmds, ask_nrcmds, ncands, nhits[3], ntrue[2]; int32_t nvalid, nvalid_head, nvalid_tail; float all_hr, head_hr, tail_hr; int is_tail_u, is_head_u; int32_t *rids, *rmarker, *fmarker; + gk_fkv_t *rcands, cand; float *rscores, hr[3], arhr, larhr, baseline; params_t *params; - gk_csr_t *oldmat, *tstmat = NULL, *model; + gk_csr_t *oldmat, *tstmat = NULL, *negmat = NULL, *model; int32_t ioptions[SLIM_NOPTIONS]; FILE *fpout = NULL; @@ -34,6 +35,8 @@ int main(int argc, char *argv[]) { oldmat = gk_csr_Read(params->trnfile, params->ifmt, params->readvals, 0); if (params->tstfile) tstmat = gk_csr_Read(params->tstfile, params->ifmt, params->readvals, 0); + if (params->negfile) + negmat = gk_csr_Read(params->negfile, params->ifmt, params->readvals, 0); printf( "------------------------------------------------------------------\n"); @@ -47,6 +50,9 @@ int main(int argc, char *argv[]) { if (tstmat) printf(" tstfile: %s, nrows: %d, ncols: %d, nnz: %zd\n", params->tstfile, tstmat->nrows, tstmat->ncols, tstmat->rowptr[tstmat->nrows]); + if (negmat) + printf(" negfile: %s, nrows: %d, ncols: %d, nnz: %zd\n", params->negfile, + negmat->nrows, negmat->ncols, negmat->rowptr[negmat->nrows]); if 
(params->outfile) printf(" outfile: %s\n", (params->outfile ? params->outfile : "No output")); @@ -62,18 +68,25 @@ int main(int argc, char *argv[]) { gk_free((void **)&oldmat->rowval, LTERM); if (tstmat) gk_free((void **)&tstmat->rowval, LTERM); + if (negmat) + gk_free((void **)&negmat->rowval, LTERM); } SLIM_iSetDefaults(ioptions); ioptions[SLIM_OPTION_DBGLVL] = params->dbglvl; - /* predict for each row in oldmat */ if (params->outfile) fpout = gk_fopen(params->outfile, "w", "outfile"); - rids = gk_i32malloc(params->nrcmds, "rids"); - rscores = gk_fmalloc(params->nrcmds, "rscores"); + /* if we are using a negative test, ask for a score for all non-supplied items */ + ask_nrcmds = (negmat ? model->nrows : params->nrcmds); + + /* allocate necessary arrays */ + rids = gk_i32malloc(ask_nrcmds, "rids"); + rscores = gk_fmalloc(ask_nrcmds, "rscores"); rmarker = (tstmat ? gk_i32smalloc(model->ncols, -1, "rmarker") : NULL); + rcands = (negmat ? gk_fkvmalloc(model->ncols, "rcands") : NULL); + // get head and tail columns, mark 0 for head items and 1 for items in tail fmarker = (tstmat ? SLIM_DetermineHeadAndTail( oldmat->nrows, gk_max(oldmat->ncols, tstmat->ncols), @@ -84,12 +97,72 @@ int main(int argc, char *argv[]) { arhr = 0.0; nvalid = nvalid_head = nvalid_tail = 0; + + /* predict for each row in oldmat */ for (iU = 0; iU < oldmat->nrows; iU++) { nrcmds = SLIM_GetTopN( model, oldmat->rowptr[iU + 1] - oldmat->rowptr[iU], oldmat->rowind + oldmat->rowptr[iU], - (oldmat->rowval ? oldmat->rowval + oldmat->rowptr[iU] : NULL), ioptions, - params->nrcmds, rids, rscores); + (oldmat->rowval ? 
oldmat->rowval + oldmat->rowptr[iU] : NULL), + ioptions, ask_nrcmds, rids, rscores); + + /* if negative test items, select the params->nrcmds from neg+pos test */ + if (negmat && nrcmds != SLIM_ERROR) { + for (zI = tstmat->rowptr[iU]; zI < tstmat->rowptr[iU + 1]; zI++) + rmarker[tstmat->rowind[zI]] = -2; + for (zI = negmat->rowptr[iU]; zI < negmat->rowptr[iU + 1]; zI++) + rmarker[negmat->rowind[zI]] = -2; + + /* select the neg+pos that were in the recommended list */ + for (ncands=0, iR=0; iRrowptr[iU]; zI < tstmat->rowptr[iU + 1]; zI++) { + if (rmarker[tstmat->rowind[zI]] != -3) { + rcands[ncands].val = tstmat->rowind[zI]; + rcands[ncands].key = 0.0; + ncands++; + } + rmarker[tstmat->rowind[zI]] = -1; + } + for (zI = negmat->rowptr[iU]; zI < negmat->rowptr[iU + 1]; zI++) { + if (rmarker[negmat->rowind[zI]] != -3) { + rcands[ncands].val = negmat->rowind[zI]; + rcands[ncands].key = 0.0; + ncands++; + } + rmarker[negmat->rowind[zI]] = -1; + } + //printf("ncands: %5d,", ncands); + + + /* shuffle prior to sorting */ + for (iR=0; iRnrcmds); + for (iR=0; iR 0 ? 
1.0 * nhits[0] / ntrue[0] : 0.0); @@ -171,9 +245,11 @@ int main(int argc, char *argv[]) { "------------------------------------------------------------------\n"); /* clean up */ - gk_free((void **)&rids, &rscores, &rmarker, &fmarker, LTERM); + gk_free((void **)&rids, &rscores, &rmarker, &fmarker, &rcands, LTERM); SLIM_FreeModel((slim_t **)&model); gk_csr_Free(&oldmat); if (tstmat) gk_csr_Free(&tstmat); + if (negmat) + gk_csr_Free(&negmat); } diff --git a/src/programs/struct.h b/src/programs/struct.h index 6b22bce..bf82693 100644 --- a/src/programs/struct.h +++ b/src/programs/struct.h @@ -17,6 +17,7 @@ the University of Minnesota typedef struct { char *trnfile; /*!< the file of historical preferences */ char *tstfile; /*!< the file to validate the recommendations */ + char *negfile; /*!< the file containing the negative test instances */ char *l12file; /*!< the file that contains the regularization values over which to search */ char *mdlfile; /*!< the model file during prediction */