trainer.py
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
# Load the CSV dataset
data = pd.read_csv("clean_sql_dataset.csv")
# Separate queries and labels
queries = data["Query"].tolist()
labels = data["Label"].tolist()
# Tokenization and padding
tokenizer = Tokenizer(num_words=5000) # Limit vocabulary size for better generalization
tokenizer.fit_on_texts(queries)
sequences = tokenizer.texts_to_sequences(queries)
padded_sequences = pad_sequences(sequences, maxlen=100, padding="post") # Limit sequence length
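# padded_sequences now has shape (num_queries, 100); shorter queries are zero-padded at the end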
# Convert labels to numpy arrays
y_labels = np.array(labels)
# Split data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, y_labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
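# Resulting split: 70% train, 15% validation, 15% test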
# Compute balanced class weights to counteract class imbalance in the labels
class_weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))
# Build the model: an embedding layer followed by a 1D convolution over the token sequence
model = tf.keras.Sequential([
    # The tokenizer caps word indices below 5,000, so the embedding never needs more rows than that
    tf.keras.layers.Embedding(input_dim=min(len(tokenizer.word_index) + 1, 5000), output_dim=128, input_length=100),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
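# The Conv1D layer scans 5-token windows of the embedded query and GlobalMaxPooling1D keeps
# the strongest activation per filter, so the model effectively learns n-gram detectors for
# suspicious SQL fragments.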
model.compile(loss="binary_crossentropy",
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics=["accuracy"])
# Train the model with class weights and validation
model.fit(X_train, y_train, epochs=15, batch_size=64,
          validation_data=(X_val, y_val), class_weight=class_weight_dict)
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}")
# Save the trained model in SavedModel format
model.save("sql_injection_detection_model")
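# --- Optional inference sketch: one possible way to use the saved artifacts ---
# The fitted tokenizer is required to preprocess new queries exactly as during training,
# so persist it alongside the model; the pickle filename here is an illustrative assumption.
import pickle

with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

# Score a single query, assuming Label == 1 marks an injection; the 0.5 threshold is a
# default and can be tuned on the validation set.
sample_query = ["SELECT * FROM users WHERE id = 1 OR 1=1"]
sample_padded = pad_sequences(tokenizer.texts_to_sequences(sample_query), maxlen=100, padding="post")
score = float(model.predict(sample_padded)[0][0])
print("Injection" if score >= 0.5 else "Benign", f"(score={score:.3f})")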