-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsms_spam_prediction_using_bert.py
117 lines (81 loc) · 3.24 KB
/
sms_spam_prediction_using_bert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
"""SMS Spam Prediction Using BERT.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1uB43Fb7M0N-C0JnFKZOFjCtUVnhZS_UW
"""
# Install the TensorFlow packages this script needs.
# The notebook export used IPython shell magics ("!pip3 install ..."), which
# are a SyntaxError in a plain .py file. Invoking pip through the running
# interpreter works both in a notebook and as a script, and guarantees the
# packages land in the environment that will import them.
import subprocess
import sys

for _package in ("tensorflow", "tensorflow_text"):
    # check_call raises CalledProcessError if pip exits non-zero, so a failed
    # install stops the script here instead of surfacing later as ImportError.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--quiet", _package]
    )
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset path below resolves.
drive.mount('/content/drive')
import csv

# Print every raw row of the CSV as a list of strings, decoded as ISO-8859-1.
# newline='' is what the csv module expects for files it parses itself.
with open('/content/drive/MyDrive/Datasets/spam.csv', newline='', encoding = "ISO-8859-1") as f:
    for row in csv.reader(f):
        print(row)
import pandas as pd

# Load the SMS dataset, decoded as ISO-8859-1.
df = pd.read_csv('/content/drive/MyDrive/Datasets/spam.csv', encoding = "ISO-8859-1")

# Notebook-style inspection: these bare expressions only render their value in
# an interactive session; run as a script they are evaluated and discarded.
df.head(5)
df.groupby('v1').describe()
df['v1'].value_counts()

# Split the rows by the label stored in column 'v1'.
df_spam = df[df['v1'] == 'spam']
df_spam.shape
df_ham = df[df['v1'] == 'ham']
df_ham.shape

# Downsample ham to the number of spam rows so both classes are equally sized.
# A fixed random_state makes the drawn subset (and everything trained on it)
# repeatable from run to run.
# NOTE(review): assumes ham rows outnumber spam rows; sampling without
# replacement raises ValueError otherwise.
df_ham_downsampled = df_ham.sample(n=len(df_spam), random_state=42)
df_ham_downsampled.shape

df_balanced = pd.concat([df_ham_downsampled, df_spam])
df_balanced.shape
df_balanced['v1'].value_counts()

# Binary target column: 1 for 'spam', 0 for anything else. The vectorised
# comparison replaces a per-row Python lambda and yields the same integers.
df_balanced['spam'] = (df_balanced['v1'] == 'spam').astype(int)
df_balanced.sample(5)
from sklearn.model_selection import train_test_split

# Hold-out split of message text ('v2') against the binary 'spam' target.
# stratify keeps the spam/ham ratio the same in both partitions; random_state
# pins the shuffle so the train/test membership is identical on every run.
# No test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['v2'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)
# Notebook-style peek at the training texts (no visible effect in a script).
X_train.head(5)
# BERT preprocessing and encoder layers, loaded from TensorFlow Hub.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

# BERT layers: a scalar string input is preprocessed and then encoded.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
outputs = bert_encoder(bert_preprocess(text_input))

# Classification head: dropout on the encoder's pooled output, followed by a
# single sigmoid unit.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
spam_probability = output_layer(dropout_layer(outputs['pooled_output']))

# Wire the text input to the sigmoid output to form the final model.
model = tf.keras.Model(inputs=[text_input], outputs=[spam_probability])
model.summary()
# Metrics passed to model.compile below.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)
import numpy as np

# Train for 10 epochs on the training split, then score the held-out split.
model.fit(X_train, y_train, epochs=10)
model.evaluate(X_test, y_test)

# Flatten the model's predictions to 1-D, then threshold at 0.5 to get hard
# 0/1 labels.
spam_probabilities = model.predict(X_test).flatten()
y_predicted = np.where(spam_probabilities > 0.5, 1, 0)
y_predicted
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of true labels against thresholded predictions.
cm = confusion_matrix(y_test, y_predicted)
cm

# Annotated heatmap of the matrix; fmt='d' prints the cell counts as integers.
sn.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth')

# Text report built from the same labels and predictions.
report = classification_report(y_test, y_predicted)
print(report)
# Hand-written SMS examples used as a quick check of the trained model.
sample_messages = [
    'Hey, are you comming univarsity tomorrow?',
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p p£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    "Why don't you wait 'til at least wednesday to see if you get your .",
]
# Raw model outputs for each message (only displayed in an interactive session).
model.predict(sample_messages)