-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathspam_filter.py
117 lines (99 loc) · 3.92 KB
/
spam_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from sklearn.feature_extraction.text import CountVectorizer
# 贝叶斯公式 P(A|B)=P(B|A)*P(A)/P(B)
# 利用贝叶斯公式进行邮件筛选
# A:收到垃圾邮件 B:邮件含有某个词语
# P(A|B):在含有该词语的条件下A为垃圾的概率
# P(B|A):垃圾邮件中出现该词的概率
# P(A):收到垃圾邮件的概率,一般为0.5
# P(B):在所有邮件中该词语出现的概率
# 联合概率计算公式 P = (P1*P2*...*Pn) / (P1*P2*...*Pn + (1-P1)*(1-P2)*...*(1-Pn))
# 利用贝叶斯公式计算出每一个词对应的条件概率后,再代入联合概率公式
def Filter_text(text):
    """Return *text* reduced to lower-case ASCII letters separated by single spaces.

    Every character that is not an ASCII letter (digits, punctuation,
    whitespace, ...) is turned into a space, runs of whitespace are collapsed
    to one space, and the result is lower-cased.  A leading or trailing run of
    non-letters therefore survives as exactly one space.
    """
    # Step 1: blank out everything that is not a-z / A-Z.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Step 2: squeeze the resulting runs of spaces down to a single space.
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.lower()
def Count(text):
    """Return a ``{word: occurrence count}`` dict for *text*.

    Tokenisation is delegated to a default-configured scikit-learn
    ``CountVectorizer`` fitted on *text* as a single document.  The dict is
    built in the order in which the vectorizer reports its feature names.

    Returns an empty dict when the vectorizer finds no tokens in *text*.
    """
    vectorizer = CountVectorizer()
    try:
        # One document in, so row 0 of the dense matrix holds all the counts.
        counts = vectorizer.fit_transform([text]).toarray()[0]
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when the
        # text contains no tokens; treat that as "no words" instead of crashing.
        return {}

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement but keep working on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()

    return {word: int(count) for word, count in zip(words, counts)}
def Sum(dic):
    """Return the total of all values in the word-frequency dict *dic*.

    An empty dict yields 0.
    """
    return sum(dic.values())
def Bayes(test):
    """Return the combined probability that the e-mail text *test* is spam.

    The text is normalised with ``Filter_text`` and counted with ``Count``.
    The (up to) 15 words that occur most often in the message are each given
    a per-word spam probability, and those are merged with the combined
    formula ``p1 / (p1 + p2)`` where ``p1 = prod(P_i)`` and
    ``p2 = prod(1 - P_i)``.

    Reads the module-level ``spam_dic``, ``health_dic``, ``spam_sum`` and
    ``health_sum`` built from the training corpora.  A message with no
    countable words yields 0.5.
    """
    top_n = 15                  # number of most frequent words to score
    unseen_word_prob = 0.4      # P(spam | word) for a word in neither corpus
    # Frequency assumed for a word that is absent from one corpus only.
    # The original note says the reference material suggests 1%; 0.3% is used here.
    missing_class_freq = 0.003
    spam_prior = 0.5            # P(spam); P(ham) is its complement

    test = Filter_text(test)
    ranked = sorted(Count(test).items(), key=lambda item: item[1], reverse=True)

    probs = []
    # Slicing handles messages with fewer than top_n distinct words.
    for word, _ in ranked[:top_n]:
        spam_count = spam_dic.get(word, 0)
        ham_count = health_dic.get(word, 0)

        if not spam_count and not ham_count:
            # Never seen in training: spam tends to reuse a fixed vocabulary,
            # so an unknown word is slightly more likely to be legitimate.
            probs.append(unseen_word_prob)
            continue

        # A word seen in only one corpus gets a small floor frequency in the
        # other one, so the probability is never exactly 0 or 1.  This applies
        # in both directions: a ham-only word must pull the score towards ham
        # rather than being treated like an unknown word.
        word_spam = spam_count / spam_sum if spam_count else missing_class_freq
        word_ham = ham_count / health_sum if ham_count else missing_class_freq

        probs.append(
            (word_spam * spam_prior)
            / ((word_ham * (1 - spam_prior)) + (word_spam * spam_prior))
        )

    # Combine the per-word probabilities.
    p1 = 1
    p2 = 1
    for p in probs:
        p1 = p1 * p
        p2 = p2 * (1 - p)
    return p1 / (p1 + p2)
# Load the training e-mails and build the ham ("health") and spam corpora.
EMAIL_DIR = 'E:/PY/spam_filter/email/'


def _read_email(kind, index):
    """Return the raw text of e-mail number *index* from the *kind* ('ham' or 'spam') folder."""
    path = EMAIL_DIR + kind + '/' + str(index) + '.txt'
    # NOTE(review): no explicit encoding is passed, so the platform default
    # applies and undecodable bytes are silently dropped (errors='ignore').
    # The with-block closes the file even if read() raises.
    with open(path, 'r', errors='ignore') as f:
        return f.read()


# E-mails 1-20 of each class form the training set; each is followed by a
# space so words from adjacent files do not run together.
health = ''.join(_read_email('ham', x) + ' ' for x in range(1, 21))
spam = ''.join(_read_email('spam', x) + ' ' for x in range(1, 21))

health = Filter_text(health)
# Drop a leading space if there is one; lstrip() never removes a real letter,
# unlike an unconditional [1:] slice.
spam = Filter_text(spam).lstrip()

# Word-frequency dicts ordered from most to least frequent.
health_dic = dict(sorted(Count(health).items(), key=lambda x: x[1], reverse=True))
spam_dic = dict(sorted(Count(spam).items(), key=lambda x: x[1], reverse=True))

health_sum = Sum(health_dic)
spam_sum = Sum(spam_dic)

# Evaluation: score the held-out e-mails 21-25 of each class.
for x in range(21, 26):
    test = _read_email('spam', x)
    print('spam' + str(x), Bayes(test))
for x in range(21, 26):
    test = _read_email('ham', x)
    print('ham' + str(x), Bayes(test))