-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_processor.py
116 lines (101 loc) · 3.68 KB
/
text_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import re,string
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('stopwords')
import nltk
# Token shapes recognised by the tokenizer. They are joined into a single
# alternation, so at any position the earliest entry that matches wins;
# the specific shapes therefore come before the catch-all at the end.
regex_str = [
    r'<[^>]+>',  # HTML-style tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w_\-]*[\w_]+)",  # hashtags ('#' stays escaped: VERBOSE is on)
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers, with optional commas / decimal part
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words of 3+ characters, allowing inner ' - _
    r'(?:[\w_]+)',  # any other run of word characters
]

# The whole alternation sits inside one capture group, so findall() yields
# plain strings. Matching is case-insensitive.
token_pattern = re.compile(
    '({})'.format('|'.join(regex_str)),
    re.VERBOSE | re.IGNORECASE,
)
def handle_emojis(tweet):
    """Replace text emoticons in *tweet* with sentiment placeholders.

    Positive emoticons (smile, laugh, love, wink) become ``' EMO_POS '`` and
    negative ones (sad, cry) become ``' EMO_NEG '``. The placeholders are
    inserted with a space on each side and no whitespace is collapsed here.
    Matching is case-sensitive: the laugh and wink patterns need an
    upper-case ``D``.

    Returns the rewritten string.
    """
    # Applied top to bottom; each rule sees the output of the previous one.
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;, (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, placeholder in emoticon_rules:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet
def preprocess_tweet(tweet):
    """Normalise a raw tweet string before tokenisation.

    Steps, in order: lower-case the text, delete URLs, delete @mentions,
    unwrap hashtags (``#tag`` becomes `` tag ``), delete the standalone word
    ``rt``, turn runs of two or more dots into a space, strip spaces and
    quote characters from both ends, replace emoticons through
    ``handle_emojis`` and finally collapse every whitespace run to a single
    space.

    Returns the cleaned string.
    """
    text = tweet.lower()

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        # URLs are removed outright (not replaced by a placeholder word).
        (r'((www\.[\S]+)|(https?://[\S]+))', ''),
        # @handles are removed outright as well.
        (r'@[\S]+', ''),
        # "#hashtag" keeps only the tag text, padded with spaces.
        (r'#(\S+)', r' \1 '),
        # Retweet marker; lower-case because the text was lowered above.
        (r'\brt\b', ''),
        # Two or more consecutive dots become a single space.
        (r'\.{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Strip space, " and ' from both ends before looking for emoticons.
    # NOTE(review): the text is already lower-case at this point, so any
    # emoticon pattern in handle_emojis that needs an upper-case letter
    # (e.g. ":D") cannot match here -- confirm whether that is intended.
    text = handle_emojis(text.strip(' "\''))

    # Collapse whitespace runs (including those left by the steps above).
    return re.sub(r'\s+', ' ', text)
# This function is required by nltk_analyzer_naive_bayes_model.
def remove_noise(tweet_tokens, stop_words=()):
    """Clean, lemmatize and filter a list of tweet tokens.

    Each token is POS-tagged with ``pos_tag``, has URL and @mention text
    removed, and is lemmatized as a noun (tags starting with ``NN``), a verb
    (tags starting with ``VB``) or an adjective (every other tag). A token
    is kept, lower-cased, only when it is non-empty, is not found in
    ``string.punctuation`` and its lower-cased form is not in *stop_words*.

    Args:
        tweet_tokens: sequence of token strings.
        stop_words: container of lower-case words to drop (default: none).

    Returns:
        A list of cleaned, lower-cased tokens in their original order.
    """
    # Raw strings: the original non-raw literal relied on "\(" and "\)"
    # being passed through, which is an invalid escape sequence warning on
    # current Python versions. The pattern text itself is unchanged.
    url_re = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    mention_re = re.compile(r"(@[A-Za-z0-9_]+)")
    # Loop-invariant: one lemmatizer serves every token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = url_re.sub('', token)
        token = mention_re.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        lowered = token.lower()
        # `in string.punctuation` is a substring test against the
        # punctuation string, exactly as in the original filter.
        if token and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens
# Get the WordNet part of speech for a POS tag.
def get_wordnet_pos(tag):
    """Map a POS tag string to a WordNet part-of-speech constant.

    Only the leading letter of *tag* is inspected: ``J`` gives
    ``wordnet.ADJ``, ``V`` gives ``wordnet.VERB``, ``N`` gives
    ``wordnet.NOUN`` and ``R`` gives ``wordnet.ADV``.

    Returns ``None`` for any other tag (including the empty string).
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None
def Lemmatizer(s):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are tagged with ``pos_tag``; each tag is converted with
    ``get_wordnet_pos`` and, when that yields nothing, the token is
    lemmatized as a noun.

    Returns the list of lemmas, in the same order as the input tokens.
    """
    word_tag_pairs = pos_tag(s)  # (token, POS tag) pairs
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos_label) or wordnet.NOUN)
        for word, pos_label in word_tag_pairs
    ]
def removeStopWord(str):
    """Return the lower-cased tokens of *str* that are not stop words.

    Args:
        str: iterable of token strings. (The name shadows the builtin but
            is kept so existing keyword callers continue to work.)

    Returns:
        A new list holding the lower-cased form of every token whose
        lower-cased form is not in the list returned by
        ``stopwords.words()``. Order is preserved.
    """
    # Nothing to filter: skip loading the stop-word corpus altogether.
    if not str:
        return []

    # Load the stop-word list once and use a set for O(1) membership tests.
    # The original re-evaluated stopwords.words() for every token and then
    # scanned the resulting list linearly.
    stop_set = set(stopwords.words())

    lowered_tokens = (token.lower() for token in str)
    return [token for token in lowered_tokens if token not in stop_set]
def tokenize(tokens_re, s):
    """Return all matches of the compiled pattern *tokens_re* found in *s*.

    This is a thin wrapper around ``tokens_re.findall(s)``; the result has
    whatever shape ``findall`` produces for that pattern.
    """
    matches = tokens_re.findall(s)
    return matches
def processText(tweet):
    """Turn a raw tweet string into a list of lemmatized tokens.

    Pipeline: ``preprocess_tweet`` cleans the text, ``tokenize`` splits it
    with the module-level ``token_pattern``, ``removeStopWord`` lower-cases
    the tokens and drops stop words, and ``Lemmatizer`` lemmatizes what is
    left.

    Returns the resulting list of tokens.
    """
    cleaned = preprocess_tweet(tweet)
    tokens = tokenize(token_pattern, cleaned)  # split the sentence into a list
    return Lemmatizer(removeStopWord(tokens))