-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path training_pipeline.py
47 lines (35 loc) · 1.5 KB
/
training_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
from dotenv import load_dotenv
from my_hopsworks.connector import FeatureStoreManager
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump
# Training pipeline: read user features from the Hopsworks feature store,
# fit a preprocessing + KMeans clustering pipeline, persist it locally,
# and register the artifact in the Hopsworks model registry.

# Load credentials from a local .env file into the process environment.
load_dotenv()

hopsworks_api_key_value: str = os.environ.get("HOPSWORKS_API_KEY_VALUE")
if not hopsworks_api_key_value:
    # Fail fast with a clear message instead of passing None to the connector.
    raise RuntimeError(
        "HOPSWORKS_API_KEY_VALUE is not set; add it to the environment or .env file"
    )

# Connect to the feature store and select the user features, excluding
# identifiers/PII and bookkeeping columns that must not feed the model.
manager = FeatureStoreManager(hopsworks_api_key_value)
users_fg = manager.get_feature_group("users", version=1)
query = users_fg.select_except(["id", "email", "name", "address", "telephone", "updated_at", "created_at", "average_age"])
df = query.read()

numerical_cols = ['age', 'weight']  # Add other numerical columns as necessary
categorical_cols = ['gender', 'height', 'language', 'blood_type', 'nationality', 'academic_degree']  # Add other categorical columns as necessary

# Column Transformer for preprocessing: scale numericals, one-hot encode
# categoricals. handle_unknown='ignore' prevents transform-time crashes on
# categories that were not present in the training data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Clustering pipeline. random_state pins the KMeans centroid initialisation
# so retraining is reproducible.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('cluster', KMeans(n_clusters=5, random_state=42))])

# Fit the model on the feature dataframe.
pipeline.fit(df)

# Persist the fitted pipeline, creating the target directory if needed
# (joblib.dump does not create missing parent directories).
model_filename = 'models/clustering_model.pkl'
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
dump(pipeline, model_filename)

# Register the serialized pipeline in the Hopsworks model registry.
model = manager.mr.python.create_model(
    name="clustering_model",
    description="Clustering Model"
)
model.save(model_filename)