train_xgboost.py
import argparse
import os
from time import time

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split


def preprocess_data(df):
    """Preprocess the Titanic dataset for model training."""
    # Fill missing values; assign the result back instead of using
    # fillna(..., inplace=True) on a column, which is deprecated in recent pandas.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    # Drop free-text columns and integer-encode the categorical features.
    df.drop(columns=["Name", "Ticket", "Cabin"], inplace=True)
    df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    X = df.drop(columns=["Survived"])
    y = df["Survived"]
    return X, y


def train_model(dtrain, params, num_round, evals=None):
    """Train the XGBoost model with the specified parameters."""
    start_time = time()
    # evals makes xgb.train report the eval_metric on each listed set per round.
    model = xgb.train(params, dtrain, num_boost_round=num_round, evals=evals or [])
    print(f"Training time: {time() - start_time:.2f} seconds")
    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", type=str, default="train.csv", help="Path to training data CSV")
    parser.add_argument("--max_depth", type=int, default=5)
    parser.add_argument("--eta", type=float, default=0.1)
    parser.add_argument("--subsample", type=float, default=0.8)
    parser.add_argument("--colsample_bytree", type=float, default=0.8)
    parser.add_argument("--num_round", type=int, default=100)
    args = parser.parse_args()

    # Detect whether we are running on SageMaker or locally:
    # SM_CHANNEL_TRAIN and SM_MODEL_DIR are set inside SageMaker training containers,
    # so the env-var fallbacks default to the current directory for local runs.
    input_data_path = os.path.join(os.environ.get("SM_CHANNEL_TRAIN", "."), args.train)
    output_data_path = os.path.join(os.environ.get("SM_MODEL_DIR", "."), "xgboost-model")

    # Load and preprocess data
    df = pd.read_csv(input_data_path)
    X, y = preprocess_data(df)

    # Split data for evaluation purposes
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f'Train size: {X_train.shape}')
    print(f'Val size: {X_val.shape}')
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Set up XGBoost parameters for training
    params = {
        "objective": "binary:logistic",
        "max_depth": args.max_depth,
        "eta": args.eta,
        "subsample": args.subsample,
        "colsample_bytree": args.colsample_bytree,
        "eval_metric": "logloss",
    }

    # Train the model, tracking log loss on the held-out validation set
    model = train_model(dtrain, params, args.num_round,
                        evals=[(dtrain, "train"), (dval, "validation")])

    # Persist the trained Booster (joblib pickles the model object)
    joblib.dump(model, output_data_path)
    print(f"Model saved to {output_data_path}")