Update for version 1.4.7

Fix NumPy version incompatibility, which caused FTRL model serialization errors with newer NumPy versions. Add apply_groupby.py, which distributes pandas groupby calls.
anttttti · May 20, 2021 · 6299a37 · 6299a37
1 parent e2be83b
commit 6299a37
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 28 deletions.
diff --git a/README.rst b/README.rst
@@ -1,5 +1,5 @@
 ===============
-Wordbatch 1.4.6
+Wordbatch 1.4.7
 ===============
 
 Overview

diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
 
 setup(
 	name='Wordbatch',
-	version='1.4.6',
+	version='1.4.7',
 	description='Python library for distributed AI processing pipelines, using swappable scheduler backends',
 	url='https://github.com/anttttti/Wordbatch',
 	author='Antti Puurula',

diff --git a/wordbatch/__init__.py b/wordbatch/__init__.py
@@ -1,4 +1,4 @@
 import os
 PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))
-__version__ = '1.4.6'
+__version__ = '1.4.7'
 
diff --git a/wordbatch/models/fm_ftrl.pyx b/wordbatch/models/fm_ftrl.pyx
@@ -90,12 +90,12 @@ cdef void update_single(int* inds, double* vals, int lenn, double e, double ialp
 		n_fm[i] += e2
 
 cdef class FM_FTRL:
-	cdef double[:] w
-	cdef double[:] z
-	cdef double[:] n
-	cdef double[:] w_fm
-	cdef double[:] z_fm
-	cdef double[:] n_fm
+	cdef const double[:] w
+	cdef const double[:] z
+	cdef const double[:] n
+	cdef const double[:] w_fm
+	cdef const double[:] z_fm
+	cdef const double[:] n_fm
 
 	cdef unsigned int threads
 	cdef unsigned int iters

diff --git a/wordbatch/models/ftrl.pyx b/wordbatch/models/ftrl.pyx
@@ -65,9 +65,9 @@ cdef void update_single(int* inds, double* vals, int lenn, double e, double ialp
 		n[i]+= g2
 
 cdef class FTRL:
-	cdef double[:] w
-	cdef double[:] z
-	cdef double[:] n
+	cdef const double[:] w
+	cdef const double[:] z
+	cdef const double[:] n
 
 	cdef unsigned int threads
 	cdef unsigned int iters

diff --git a/wordbatch/models/ftrl32.pyx b/wordbatch/models/ftrl32.pyx
@@ -65,9 +65,9 @@ cdef void update_single(int* inds, double* vals, int lenn, double e, double ialp
 		n[i]+= g2
 
 cdef class FTRL32:
-	cdef float[:] w
-	cdef float[:] z
-	cdef float[:] n
+	cdef const float[:] w
+	cdef const float[:] z
+	cdef const float[:] n
 
 	cdef unsigned int threads
 	cdef unsigned int iters

diff --git a/wordbatch/models/nn_relu_h1.pyx b/wordbatch/models/nn_relu_h1.pyx
@@ -54,11 +54,11 @@ cdef void update_single(int* inds, double* vals, int lenn, int D, int D_nn, doub
 		c1[j]+= fabs(dldw1)
 
 cdef class NN_ReLU_H1:
-	cdef double[:] w0
-	cdef double[:] w1
-	cdef double[:] z
-	cdef double[:] c0
-	cdef double[:] c1
+	cdef const double[:] w0
+	cdef const double[:] w1
+	cdef const double[:] z
+	cdef const double[:] c0
+	cdef const double[:] c1
 
 	cdef unsigned int threads
 	cdef unsigned int iters

diff --git a/wordbatch/models/nn_relu_h2.pyx b/wordbatch/models/nn_relu_h2.pyx
@@ -65,14 +65,14 @@ cdef void update_single(int* inds, double* vals, int lenn,  int D,  int D_nn,  i
 		c2[k] += fabs(dldw2)
 
 cdef class NN_ReLU_H2:
-	cdef double[:] w0
-	cdef double[:] w1
-	cdef double[:] w2
-	cdef double[:] z1
-	cdef double[:] z2
-	cdef double[:] c0
-	cdef double[:] c1
-	cdef double[:] c2
+	cdef const double[:] w0
+	cdef const double[:] w1
+	cdef const double[:] w2
+	cdef const double[:] z1
+	cdef const double[:] z2
+	cdef const double[:] c0
+	cdef const double[:] c1
+	cdef const double[:] c2
 
 	cdef unsigned int threads
 	cdef unsigned int iters

diff --git a/wordbatch/pipelines/apply_groupby.py b/wordbatch/pipelines/apply_groupby.py
@@ -0,0 +1,34 @@
+import pandas as pd
+from wordbatch.pipelines import Apply
+
+class ApplyGroupBy(object):
+	def __init__(self, batcher, function, group, rows_per_bin= 200, cache=None, vectorize=None, args=[], kwargs={}):
+		self.batcher= batcher
+		self.function= function
+		self.group= group
+		self.rows_per_bin = rows_per_bin
+		self.cache= cache
+		self.vectorize= vectorize
+		self.args= [args]
+		self.kwargs= [kwargs]
+
+	def fit(self, data, input_split= False):
+		return self
+
+	def fit_transform(self, data, input_split= False, merge_output= True):
+		return self.transform(data, input_split, merge_output)
+
+	def transform(self, data, input_split= False, merge_output= True):
+		bin_ids = data[self.group].unique()
+		group_bins= {x:1 for x in bin_ids} if len(bin_ids) <= self.rows_per_bin else \
+			{x[0]: x[1] for x in zip(bin_ids, pd.qcut(bin_ids, len(bin_ids) // self.rows_per_bin))}
+		group_bin_col = data[self.group].map(group_bins)
+		bin_ids, groups = zip(*data.groupby(group_bin_col, as_index=False))
+		t= [x for x in Apply(self.function, self.batcher, *self.args, *self.kwargs, self.cache,
+							 self.vectorize).transform(groups, input_split, merge_output)
+			if len(x) > 0]
+		try:
+			t= pd.concat(t, sort=False) # t is Series or DataFrame
+		except:
+			t= [item for sublist in t for item in sublist] # t is some iterable
+		return t