Merge pull request #2 from k8gb-io/handle-zero-records

Handling zero PDF items
k8gb-io · Aug 25, 2022 · 5fdf4f1 · 5fdf4f1
2 parents a725274 + abd9519
commit 5fdf4f1
Show file tree

Hide file tree

Showing 4 changed files with 138 additions and 33 deletions.
diff --git a/.github/workflows/tag.yaml b/.github/workflows/tag.yaml
@@ -39,5 +39,5 @@ jobs:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           RELEASE_BRANCHES: main
           WITH_V: true
-          DEFAULT_BUMP: patch
+          DEFAULT_BUMP: minor
 #          PRERELEASE_SUFFIX: beta
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ go get github.com/k8gb-io/go-weight-shuffling
 // pdf requires to be 100% in total.  
 pdf := []int{30, 40, 20, 10}
 // handle error in real code
-ws, _ := NewWS(pdf)
+ws, _ := gows.NewWS(pdf)
 // the index is selected from the probability determined by the pdf 
 index := ws.Pick()
 ```
@@ -42,15 +42,15 @@ while it will return 0 or 2 in about 10 out of 100 cases.
 
 **The only condition is that the sum of all values in the PDF is always equal to 100!**
 
-## PickVector() Usage
+## PickVector(Settings) Usage
 
 ```go
 // pdf requires to be 100% in total.  
 pdf := []int{30, 40, 20, 10}
 // handle error in real code
-ws, _ := NewWS(pdf)
+ws, _ := gows.NewWS(pdf)
 // the result is a slice of indexes, shuffled so that higher-probability indexes tend to come first
-indexes := wrr.PickVector()
+indexes := wrr.PickVector(gows.KeepIndexesForZeroPDF)
 ```
 
 A slightly more complex case is when you need to shuffle all indexes of the array to match the PDF, instead of picking a single element.
@@ -66,6 +66,17 @@ vector. For example, for `PDF={30,40,20,10}` the result will be like this:
 the function returns an index slice such that index0 will be in the zero position in about 30% of cases,
 index1 will be in the zero position in about 40% of cases, etc.
 
+### PickVector settings argument
+The `Settings` argument defines which indexes the `PickVector` function returns. Imagine you have
+a PDF with three elements and you set one of them to 0 (i.e. you turn it off, because the
+probability of picking that index is 0). There is no universal answer to what should happen to
+such an index; each use case requires different behavior. Currently two behaviors are defined:
+
+- `KeepIndexesForZeroPDF` keeps the indexes of zero pdf elements in the result; e.g. for `pdf=[0,50,50,0,0,0]` it returns `[1,2,0,3,4,5]` or `[2,1,0,3,4,5]`
+- `IgnoreIndexesForZeroPDF` filters out the indexes of zero pdf elements; e.g. for `pdf=[0,50,50,0,0,0]` it returns `[1,2]` or `[2,1]`
+
+Translated with www.DeepL.com/Translator (free version)
+
 ## Examples
 This library is ideal for Weight RoundRobin. Imagine you need to balance these addresses (can be applied to whole groups
 of addresses):

diff --git a/gows/ws.go b/gows/ws.go
@@ -22,54 +22,70 @@ import (
 
 // WS Weight Round Robin Algorithm
 type WS struct {
-	pdf      []int
-	index100 int
+	pdf                         touples
+	filteredPDFFromZeroElements touples
 }
 
+type touples []struct {
+	index      int
+	percentage int
+}
+
+// Settings defines how the PickVector function returns indexes.
+type Settings int
+
+const (
+	// KeepIndexesForZeroPDF keeps indexes for zero pdf elements.
+	// e.g: for pdf=[0,50,50,0,0,0] may return [1,2,0,3,4,5] or [2,1,0,3,4,5]
+	KeepIndexesForZeroPDF Settings = iota
+
+	// IgnoreIndexesForZeroPDF filters out indexes for zero pdf elements.
+	// e.g: for pdf=[0,50,50,0,0,0] may return [1,2] or [2,1]
+	IgnoreIndexesForZeroPDF
+)
+
 // NewWS instantiates weight round robin. It returns an error if any pdf value
 // is outside [0;100] or if the values do not sum to 100.
 func NewWS(pdf []int) (wrr *WS, err error) {
 	r := 0
-	max100 := -1
+	wrr = new(WS)
 	for i, v := range pdf {
-		if v == 100 {
-			max100 = i
-		}
 		r += v
 		if v < 0 || v > 100 {
 			return wrr, fmt.Errorf("value %v out of range [0;100]", v)
 		}
+		t := struct {
+			index      int
+			percentage int
+		}{i, v}
+		if v != 0 {
+			wrr.filteredPDFFromZeroElements = append(wrr.filteredPDFFromZeroElements, t)
+		}
+		wrr.pdf = append(wrr.pdf, t)
 	}
 	if r != 100 {
 		return wrr, fmt.Errorf("sum of pdf elements must be equal to 100 perent")
 	}
 	rand.Seed(time.Now().UnixNano())
-	wrr = new(WS)
-	wrr.pdf = pdf
-	wrr.index100 = max100
 	return wrr, nil
 }
 
 // PickVector returns slice shuffled by pdf distribution.
 // The item with the highest probability will occur more often
 // at the position that has the highest probability in the PDF
 // see README.md
-func (w *WS) PickVector() (indexes []int) {
-	if w.index100 != -1 {
-		return w.handle100()
-	}
-
-	pdf := make([]int, len(w.pdf))
-	copy(pdf, w.pdf)
+func (w *WS) PickVector(settings Settings) (indexes []int) {
+	pdf := make(touples, len(w.filteredPDFFromZeroElements))
+	copy(pdf, w.filteredPDFFromZeroElements)
 	balance := 100
 	for i := 0; i < len(pdf); i++ {
 		cdf := w.getCDF(pdf)
 		index := w.pick(cdf, balance)
 		indexes = append(indexes, index)
 
-		balance -= pdf[index]
-		pdf[index] = 0
+		balance -= pdf[index].percentage
+		pdf[index].percentage = 0
 	}
-	return indexes
+	return w.indexes(settings, indexes)
 }
 
 // Pick returns one index with probability given by pdf
@@ -80,32 +96,42 @@ func (w *WS) Pick() int {
 }
 
 // pick one index
-func (w *WS) pick(cdf []int, n int) int {
+func (w *WS) pick(cdf touples, n int) int {
 	r := rand.Intn(n)
 	index := 0
-	for r >= cdf[index] {
+	for r >= cdf[index].percentage {
 		index++
 	}
 	return index
 }
 
-func (w *WS) getCDF(pdf []int) (cdf []int) {
+func (w *WS) getCDF(pdf touples) (cdf touples) {
 	// prepare cdf
 	for i := 0; i < len(pdf); i++ {
-		cdf = append(cdf, 0)
+		cdf = append(cdf, struct {
+			index      int
+			percentage int
+		}{index: 0, percentage: 0})
 	}
 	cdf[0] = pdf[0]
 	for i := 1; i < len(pdf); i++ {
-		cdf[i] = cdf[i-1] + pdf[i]
+		cdf[i].percentage = cdf[i-1].percentage + pdf[i].percentage
 	}
 	return cdf
 }
 
-// there is no reason to calculate CDF and recompute PDF's if some field has 100%
-func (w *WS) handle100() (indexes []int) {
+func (w *WS) indexes(settings Settings, calculatedIndexes []int) (indexes []int) {
+	if settings == IgnoreIndexesForZeroPDF {
+		for _, v := range calculatedIndexes {
+			indexes = append(indexes, w.filteredPDFFromZeroElements[v].index)
+		}
+		return indexes
+	}
 	for i := 0; i < len(w.pdf); i++ {
 		indexes = append(indexes, i)
 	}
-	indexes[0], indexes[w.index100] = indexes[w.index100], indexes[0]
+	for i, v := range calculatedIndexes {
+		indexes[i], indexes[w.filteredPDFFromZeroElements[v].index] = indexes[w.filteredPDFFromZeroElements[v].index], indexes[i]
+	}
 	return indexes
 }
diff --git a/gows/ws_test.go b/gows/ws_test.go
@@ -37,6 +37,12 @@ func TestInit(t *testing.T) {
 		{"hundred", []int{0, 0, 100}, true},
 		{"hundred", []int{100, 0}, true},
 		{"hundred", []int{100}, true},
+		{"50 50 0", []int{50, 50, 0}, true},
+		{"50 50 0 0", []int{50, 50, 0, 0}, true},
+		{"50 0 50 0", []int{50, 0, 50, 0}, true},
+		{"0 50 0 50", []int{0, 50, 0, 50}, true},
+		{"50 0 0 50", []int{50, 0, 0, 50}, true},
+		{"0 0 50 0 0 50 0 0", []int{0, 0, 50, 0, 0, 50, 0, 0}, true},
 	}
 	for _, test := range tests {
 		t.Run(fmt.Sprintf("%s: %v", test.name, test.pdf), func(t *testing.T) {
@@ -67,6 +73,7 @@ func TestPick(t *testing.T) {
 		{"multiple zeros", []int{100, 0, 0}, 0},
 		{"multiple zeros", []int{0, 100, 0}, 0},
 		{"multiple zeros", []int{0, 0, 100}, 0},
+		{"50 50 0", []int{50, 50, 0}, 20},
 	}
 
 	for _, test := range tests {
@@ -104,6 +111,12 @@ func TestPickVector(t *testing.T) {
 		{"multiple zeros", []int{0, 100, 0}, 0},
 		{"multiple zeros", []int{0, 0, 100}, 0},
 		{"multiple zeros", []int{0, 0, 0, 100, 0, 0}, 0},
+		{"50 50 0", []int{50, 50, 0}, 5},
+		{"50 50 0 0", []int{50, 50, 0, 0}, 5},
+		{"50 0 50 0", []int{50, 0, 50, 0}, 5},
+		{"0 50 0 50", []int{0, 50, 0, 50}, 5},
+		{"50 0 0 50", []int{50, 0, 0, 50}, 5},
+		{"0 0 50 0 0 50 0 0", []int{0, 0, 50, 0, 0, 50, 0, 0}, 5},
 	}
 	for _, test := range tests {
 		t.Run(fmt.Sprintf("%s: %v", test.name, test.pdf), func(t *testing.T) {
@@ -117,7 +130,7 @@ func TestPickVector(t *testing.T) {
 
 			for i := 0; i < n; i++ {
 
-				indexes := wrr.PickVector()
+				indexes := wrr.PickVector(KeepIndexesForZeroPDF)
 				for _, v := range indexes {
 					assert.True(t, v >= 0 && v < len(test.pdf), "Pick returned index out of range")
 				}
@@ -140,6 +153,61 @@ func TestPickVector(t *testing.T) {
 	}
 }
 
+func TestSettings(t *testing.T) {
+	tests := []struct {
+		name                string
+		pdf                 []int
+		expectedIndexValues []int
+		settings            Settings
+	}{
+		{"happy distribution -Ignore", []int{30, 40, 20, 10}, []int{0, 1, 2, 3}, IgnoreIndexesForZeroPDF},
+		{"happy distribution - Keep", []int{30, 40, 20, 10}, []int{0, 1, 2, 3}, KeepIndexesForZeroPDF},
+
+		{"one element  - Ignore", []int{100}, []int{0}, 0},
+		{"one element  - Keep", []int{100}, []int{0}, 0},
+
+		{"one zero - Ignore", []int{100, 0}, []int{0}, IgnoreIndexesForZeroPDF},
+		{"one zero - Keep ", []int{100, 0}, []int{0, 1}, KeepIndexesForZeroPDF},
+		{"0 100 0 - Ignore", []int{0, 100, 0}, []int{1}, IgnoreIndexesForZeroPDF},
+		{"0 100 0 - Keep ", []int{0, 100, 0}, []int{0, 1, 2}, KeepIndexesForZeroPDF},
+
+		{"0 50 0 50 - Ignore ", []int{0, 50, 0, 50}, []int{1, 3}, IgnoreIndexesForZeroPDF},
+		{"0 50 0 50 - Keep ", []int{0, 50, 0, 50}, []int{0, 1, 2, 3}, KeepIndexesForZeroPDF},
+
+		{"0 0 50 0 0 50 0 0 - Ignore", []int{0, 0, 50, 0, 0, 50, 0, 0}, []int{2, 5}, IgnoreIndexesForZeroPDF},
+		{"0 0 50 0 0 50 0 0 - Keep", []int{0, 0, 50, 0, 0, 50, 0, 0}, []int{0, 1, 2, 3, 4, 5, 6, 7}, KeepIndexesForZeroPDF},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			wrr, err := NewWS(test.pdf)
+			require.NoError(t, err)
+			for x := 0; x < 2; x++ {
+				indexes := wrr.PickVector(test.settings)
+				assert.True(t, containsValues(indexes, test.expectedIndexValues), "%v %v", indexes, test.expectedIndexValues)
+			}
+		})
+	}
+}
+
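+// containsValues reports whether slices a and b have the same length and hold
+// the same values; the values may be in a different order.
+// It counts occurrences across both slices and requires every value to be seen
+// exactly twice, so it is only reliable when neither slice contains duplicates.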
+func containsValues(a, b []int) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	m := make(map[int]int, len(a))
+	for i := 0; i < len(a); i++ {
+		m[a[i]]++
+		m[b[i]]++
+	}
+	for _, v := range m {
+		if v != 2 {
+			return false
+		}
+	}
+	return true
+}
+
 func sum(result []int) (sum int) {
 	for _, v := range result {
 		sum += v