diff --git a/dataset/dataset.go b/dataset/dataset.go index aba119c66..6a571c123 100644 --- a/dataset/dataset.go +++ b/dataset/dataset.go @@ -18,6 +18,7 @@ import ( "time" "github.com/chewxy/math32" + mapset "github.com/deckarep/golang-set/v2" "github.com/samber/lo" "github.com/zhenghaoz/gorse/storage/data" "modernc.org/strutil" @@ -35,6 +36,7 @@ type Dataset struct { itemFeedback [][]ID userDict *FreqDict itemDict *FreqDict + categories mapset.Set[string] } func NewDataset(timestamp time.Time, userCount, itemCount int) *Dataset { @@ -48,6 +50,7 @@ func NewDataset(timestamp time.Time, userCount, itemCount int) *Dataset { itemFeedback: make([][]ID, itemCount), userDict: NewFreqDict(), itemDict: NewFreqDict(), + categories: mapset.NewSet[string](), } } @@ -71,6 +74,10 @@ func (d *Dataset) GetItemFeedback() [][]ID { return d.itemFeedback } +func (d *Dataset) GetCategories() []string { + return d.categories.ToSlice() +} + // GetUserIDF returns the IDF of users. // // IDF(u) = log(I/freq(u)) @@ -145,6 +152,7 @@ func (d *Dataset) AddItem(item data.Item) { if len(d.itemFeedback) < len(d.items) { d.itemFeedback = append(d.itemFeedback, nil) } + d.categories.Append(item.Categories...) } func (d *Dataset) AddFeedback(userId, itemId string) { diff --git a/master/tasks.go b/master/tasks.go index b73ce668f..13947fba7 100644 --- a/master/tasks.go +++ b/master/tasks.go @@ -171,7 +171,7 @@ func (m *Master) runLoadDatasetTask() error { InactiveItemsTotal.Set(float64(inactiveItems)) // write categories to cache - if err = m.CacheClient.SetSet(ctx, cache.ItemCategories, rankingDataset.CategorySet.ToSlice()...); err != nil { + if err = m.CacheClient.SetSet(ctx, cache.ItemCategories, dataSet.GetCategories()...); err != nil { log.Logger().Error("failed to write categories to cache", zap.Error(err)) } @@ -669,10 +669,6 @@ func (m *Master) LoadDataFromDatabase( temp := time.Now().AddDate(0, 0, -int(positiveFeedbackTTL)) feedbackTimeLimit = data.WithBeginTime(temp) } - timeWindowLimit := time.Time{} - if m.Config.Recommend.Popular.PopularWindow > 0 { - timeWindowLimit = time.Now().Add(-m.Config.Recommend.Popular.PopularWindow) - } rankingDataset = ranking.NewMapIndexDataset() // STEP 1: pull users @@ -742,9 +738,6 @@ func (m *Master) LoadDataFromDatabase( itemIndex := rankingDataset.ItemIndex.ToNumber(item.ItemId) if len(rankingDataset.ItemFeatures) == int(itemIndex) { rankingDataset.ItemFeatures = append(rankingDataset.ItemFeatures, nil) - rankingDataset.HiddenItems = append(rankingDataset.HiddenItems, false) - rankingDataset.ItemCategories = append(rankingDataset.ItemCategories, item.Categories) - rankingDataset.CategorySet.Append(item.Categories...) } features := click.ConvertLabelsToFeatures(item.Labels) rankingDataset.NumItemLabelUsed += len(features) @@ -772,9 +765,6 @@ func (m *Master) LoadDataFromDatabase( }) } } - if item.IsHidden { // set hidden flag - rankingDataset.HiddenItems[itemIndex] = true - } dataSet.AddItem(item) } span.Add(len(batchItems)) @@ -790,7 +780,6 @@ func (m *Master) LoadDataFromDatabase( LoadDatasetStepSecondsVec.WithLabelValues("load_items").Set(time.Since(start).Seconds()) // create positive set - popularCount := make([]int32, rankingDataset.ItemCount()) positiveSet := make([]mapset.Set[int32], rankingDataset.UserCount()) for i := range positiveSet { positiveSet[i] = mapset.NewSet[int32]() @@ -835,10 +824,6 @@ func (m *Master) LoadDataFromDatabase( posFeedbackCount++ // insert feedback to ranking dataset rankingDataset.AddFeedback(f.UserId, f.ItemId, false) - // insert feedback to popularity counter - if f.Timestamp.After(timeWindowLimit) && !rankingDataset.HiddenItems[itemIndex] { - popularCount[itemIndex]++ - } // insert feedback to evaluator evaluator.Positive(f.FeedbackType, userIndex, itemIndex, f.Timestamp) mu.Unlock() diff --git a/model/ranking/data.go b/model/ranking/data.go index dc9850100..58748487d 100644 --- a/model/ranking/data.go +++ b/model/ranking/data.go @@ -31,18 +31,15 @@ import ( // DataSet contains preprocessed data structures for recommendation models. type DataSet struct { - UserIndex base.Index - ItemIndex base.Index - FeedbackUsers base.Array[int32] - FeedbackItems base.Array[int32] - UserFeedback [][]int32 - ItemFeedback [][]int32 - Negatives [][]int32 - ItemFeatures [][]lo.Tuple2[int32, float32] - UserFeatures [][]lo.Tuple2[int32, float32] - HiddenItems []bool - ItemCategories [][]string - CategorySet mapset.Set[string] + UserIndex base.Index + ItemIndex base.Index + FeedbackUsers base.Array[int32] + FeedbackItems base.Array[int32] + UserFeedback [][]int32 + ItemFeedback [][]int32 + Negatives [][]int32 + ItemFeatures [][]lo.Tuple2[int32, float32] + UserFeatures [][]lo.Tuple2[int32, float32] // statistics NumItemLabels int32 NumUserLabels int32 @@ -53,7 +50,6 @@ type DataSet struct { // NewMapIndexDataset creates a data set. func NewMapIndexDataset() *DataSet { s := new(DataSet) - s.CategorySet = mapset.NewSet[string]() // Create index s.UserIndex = base.NewMapIndex() s.ItemIndex = base.NewMapIndex() @@ -179,9 +175,6 @@ func (dataset *DataSet) Split(numTestUsers int, seed int64) (*DataSet, *DataSet) trainSet, testSet := new(DataSet), new(DataSet) trainSet.NumItemLabels, testSet.NumItemLabels = dataset.NumItemLabels, dataset.NumItemLabels trainSet.NumUserLabels, testSet.NumUserLabels = dataset.NumUserLabels, dataset.NumUserLabels - trainSet.HiddenItems, testSet.HiddenItems = dataset.HiddenItems, dataset.HiddenItems - trainSet.ItemCategories, testSet.ItemCategories = dataset.ItemCategories, dataset.ItemCategories - trainSet.CategorySet, testSet.CategorySet = dataset.CategorySet, dataset.CategorySet trainSet.ItemFeatures, testSet.ItemFeatures = dataset.ItemFeatures, dataset.ItemFeatures trainSet.UserFeatures, testSet.UserFeatures = dataset.UserFeatures, dataset.UserFeatures trainSet.NumItemLabelUsed, testSet.NumItemLabelUsed = dataset.NumItemLabelUsed, dataset.NumItemLabelUsed