se.py
#!/usr/bin/python
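"""se.py -- a small single-domain crawler and indexer (Python 2).

Given a start URL on the command line, the script fetches pages, keeps only
links that stay on the same domain, strips scripts and styles, tokenizes and
lemmatizes the remaining text with NLTK, and indexes each page into
Elasticsearch. Redis sets serve as the crawl frontier (one set per domain)
and as the record of pages already processed ("<domain>_complete"). Two
worker processes drain the frontier in parallel.

Usage (example URL): python se.py http://example.com
"""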
import sys
import time
from multiprocessing import Process
from datetime import datetime
from urlparse import urlparse
import random
import logging
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
import ssl
import hashlib
import urllib3
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import redis
#----- Settings & object creation -------#
urllib3.disable_warnings()
es = Elasticsearch()
pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
redisObj = redis.Redis(connection_pool=pool)
domain = sys.argv[1]
datetime_str = datetime.now().strftime("%Y-%m-%d %H:%M")
clean_domain = urlparse(domain)
logging.basicConfig(filename='/root/log/se_'+clean_domain.netloc+'_'+datetime_str+'.log',level=logging.DEBUG)
print_all=1
logging_all=1
logging.debug(datetime_str+' - SE started successfully')
esIndex="searchengine1"
esType="searchEngineType1"
save=1
#----- End Settings & object creation -------#
def getDateTime():
    return datetime.now().strftime("%Y-%m-%d %H:%M")
def geturl(soup, domain, clean=0):
    # Collect links from the page, keep only same-domain ones, and normalise
    # relative links to absolute URLs. ('clean' is accepted but unused.)
    arr = []
    for link in soup.findAll("a"):
        line_href = link.get("href")
        if not line_href or line_href == '#' or line_href == '/':
            continue
        url_c = line_href
        url_p = urlparse(line_href)
        if url_p.netloc != domain and url_p.netloc != '':
            continue  # external link, skip
        if url_p.netloc == '':
            url_c = clean_domain.netloc + '/' + line_href
        if url_p.scheme == '':
            url_c = clean_domain.scheme + '://' + url_c
        arr.append(url_c)
    return arr
def removeEmptyLines(soup):
    text = soup.get_text()
    return "\n".join([ll.rstrip() for ll in text.splitlines() if ll.strip()])
'''From Example 1'''
def tokenized_docs(text, raw=1):
    docs = [word_tokenize(doc) for doc in [text]]
    if raw == 1:
        return docs
    else:
        return ' '.join(docs[0]).encode('utf8')
'''From Example 2'''
def tokenized_docs_no_punctuation(tokenized_docs, raw=1):
    # see http://docs.python.org/2/library/string.html for string.punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    no_punctuation = []
    for review in tokenized_docs:
        new_review = []
        for token in review:
            new_token = regex.sub(u'', token)
            if not new_token == u'':
                new_review.append(new_token)
        no_punctuation.append(new_review)
    if raw == 1:
        return no_punctuation
    else:
        return ' '.join(no_punctuation[0]).encode('utf8')
'''From Example 3'''
def tokenized_docs_no_stopwords(tokenized_docs_no_punctuation, raw=1):
    english_stopwords = set(stopwords.words('english'))  # build the stopword set once, not per word
    no_stopwords = []
    for doc in tokenized_docs_no_punctuation:
        new_term_vector = []
        for word in doc:
            if word not in english_stopwords:
                new_term_vector.append(word)
        no_stopwords.append(new_term_vector)
    if raw == 1:
        return no_stopwords
    else:
        return ' '.join(no_stopwords[0]).encode('utf8')
'''From Example 4'''
def preprocessed_docs(tokenized_docs_no_stopwords, raw=1):
    porter = PorterStemmer()
    snowball = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    final_doc = []
    for doc in tokenized_docs_no_stopwords:
        for word in doc:
            # alternatives: porter.stem(word) or snowball.stem(word)
            # note that lemmatize() can also take a part of speech as an argument
            final_doc.append(wordnet.lemmatize(word))
    if raw == 1:
        return final_doc
    else:
        return ' '.join(final_doc).encode('utf8')
def ProcessKeyword(soup, raw=1):
    ### Start Keywords process to search ###
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    final_str = removeEmptyLines(soup)
    tokenized_docs1 = tokenized_docs(final_str)
    tokenized_docs_no_punctuation1 = tokenized_docs_no_punctuation(tokenized_docs1)
    tokenized_docs_no_stopwords1 = tokenized_docs_no_stopwords(tokenized_docs_no_punctuation1)
    preprocessed_docs1 = preprocessed_docs(tokenized_docs_no_stopwords1)
    if raw == 1:
        return preprocessed_docs1
    else:
        return ' '.join(preprocessed_docs1).encode('utf8')
### End Keywords process to search ###
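# worker(): crawl loop run by each child process. Pops URLs from the per-domain
# Redis frontier set, skips ones already marked complete, fetches each page with
# a randomized Chrome-like User-Agent, pushes newly found in-domain links back
# onto the frontier, and indexes the processed text into Elasticsearch.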
def worker(i):
    while True:
        str1 = redisObj.spop(clean_domain.netloc)
        if str1 == "" or str1 == "None" or str1 is None:
            print 'found nothing, returning from thread ' + str(i)
            return
        if redisObj.sismember(clean_domain.netloc + '_complete', str1) != True:
            logging.info('Thread - ' + str(i) + ' is processing ' + str1)
            ## Fetch the page with a randomized Chrome-like User-Agent
            rand1 = random.randint(1, 30)
            rand2 = random.randint(1, 30)
            user_agent = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.14' + str(rand1) + '.90 Safari/537.' + str(rand2)}
            try:
                requestObj = requests.get(str1, headers=user_agent)
            except:
                print "request failed " + str1
                continue
            soup = BeautifulSoup(requestObj.text, "html.parser")
            try:
                link_array1 = geturl(soup, clean_domain.netloc, 1)
            except:
                print "link extraction failed"
                link_array1 = []
            ###print link_array1
            ProcessKeyword1 = ProcessKeyword(soup)
            try:
                if link_array1:
                    redisObj.sadd(clean_domain.netloc, *set(link_array1))  # push new links onto the frontier
                redisObj.sadd(clean_domain.netloc + "_complete", str1)     # mark this URL as processed
            except:
                print "sadd failed"
                print '-------start---'
                print link_array1
                print '-------end---'
                continue
            try:
                if save == 1:
                    uniq_md5 = hashlib.md5(str1).hexdigest()
                    doc = {'unix_time': int(round(time.time())), 'link': str1, 'text': ProcessKeyword1, 'timestamp': datetime.now()}
                    res = es.index(index=esIndex, doc_type=esType, id=uniq_md5, body=doc)
                else:
                    print ProcessKeyword1
            except:
                print 'Sorry, saving process failed... ' + str1
                return
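#----- Main: seed the frontier with the start URL, then spawn worker processes -------#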
rand1 = random.randint(1, 30)
rand2 = random.randint(1, 30)
user_agent = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.14' + str(rand1) + '.90 Safari/537.' + str(rand2)}
requestObj = requests.get(domain, headers=user_agent)
soup = BeautifulSoup(requestObj.text, "html.parser")
link_array = geturl(soup, clean_domain.netloc, 1)
#print link_array
ProcessKeyword1 = ProcessKeyword(soup)
if link_array:
    redisObj.sadd(clean_domain.netloc, *set(link_array))  # seed the frontier with links from the start page
redisObj.sadd(clean_domain.netloc + "_complete", domain)
try:
    if save == 1:
        uniq_md5 = hashlib.md5(domain).hexdigest()
        doc = {'unix_time': int(round(time.time())), 'link': domain, 'text': ProcessKeyword1, 'timestamp': datetime.now()}
        res = es.index(index=esIndex, doc_type=esType, id=uniq_md5, body=doc)
    else:
        print ProcessKeyword1
except:
    print 'Sorry, saving process failed...'
# Spawn two crawler processes that drain the frontier
for i in range(2):
    t = Process(target=worker, args=(i,))
    t.start()
    #t.join()
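# A minimal sketch (not part of this script; assumes a local Elasticsearch node
# and the index/field names configured above) of how indexed pages could be
# queried back out afterwards; "seo" is just a placeholder search term:
#
#   from elasticsearch import Elasticsearch
#   es = Elasticsearch()
#   res = es.search(index="searchengine1",
#                   body={"query": {"match": {"text": "seo"}}})
#   for hit in res['hits']['hits']:
#       print hit['_source']['link']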