tokeniser.py
from bs4 import BeautifulSoup
import requests
import nltk  # for tokenising
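# Note on local setup: nltk.word_tokenize below relies on the 'punkt' tokeniser
# models; if they are not installed yet, run nltk.download('punkt') once.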
def checkurl(s):
    if "www" in s:
        return s
    else:
        return "www." + s
def crawl(url):  # fetches the page at that url and parses it
    r = requests.get(url)  # "lxml" is a parser name, not a requests argument
    data = r.text
    soup = BeautifulSoup(data, "lxml")  # name the parser explicitly
    return soup
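# A minimal sanity check (assumes network access and that lxml is installed):
# crawl("https://www.example.com").title.string should give the page title.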
H = {}  # domain -> number of times the domain appeared as a link target
dict_tokens = {}  # token -> {domain: number of times the token appears there}
n = 0  # pages crawled so far
urls = []  # all the urls read, in the order they were found
search_list = []
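# Illustrative shape only: dict_tokens might end up as
# {"Mac": {"www.apple.com": 4, "www.icloud.com": 1}, ...} (made-up counts).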
def crawler(url):
    global urls
    global n
    global dict_tokens
    n = n + 1
    if n <= 10:  # hard cap: stop after ten pages
        crawlresult = crawl(url)
        for script in crawlresult(["script", "style"]):  # remove non-text markup
            script.extract()
        text = crawlresult.get_text()  # only the visible text remains
        lines = (line.strip() for line in text.splitlines())
        # str.strip() removes the extra whitespace around each string
        # str.splitlines() splits the text on \n and yields each line as an element
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        # print(text)
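        # e.g. "  Buy  Mac \n\n iPad  " collapses to "Buy\nMac\niPad" (illustrative).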
        tokens = nltk.word_tokenize(text)  # list of tokens
        domain = url.split('/')[2]  # e.g. "www.apple.com" from "https://www.apple.com/mac"
        for token in tokens:
            if token in dict_tokens.keys():
                if domain in dict_tokens[token].keys():
                    dict_tokens[token][domain] = dict_tokens[token][domain] + 1
                else:
                    dict_tokens[token][domain] = 1
            else:
                dict_tokens[token] = {}
                dict_tokens[token][domain] = 1
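        # (An equivalent, more idiomatic way to count would be
        # dict_tokens.setdefault(token, {}) followed by .get(domain, 0) + 1.)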
print("The dictionary of tokens: ")
print(dict_tokens)
for link in crawlresult.find_all('a'):#all the links are with tag 'a'
webad = link.get('href')#href=web adderss
if "http" in str(webad):
parts = webad.split("/")
addr = checkurl(str(parts[2]))
if addr in H:
H[addr] = int(H[addr]) + 1
else:
H[addr] = 1
if addr in url:
pass
else:
urls.append(webad)
crawler(webad)
crawler('https://www.apple.com')
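# The seed page is arbitrary; any reachable url works. The recursion above
# stops once ten pages have been fetched, so the run is bounded.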
mat = {}  # token -> list of counts, one entry per domain in urls[0:9]
for key in dict_tokens.keys():
    mat[key] = []
    # print(key, dict_tokens[key])
    for dom in urls[0:9]:  # the first nine urls gathered during the crawl
        k = dom.split('/')[2]
        if k in dict_tokens[key].keys():
            mat[key].append(dict_tokens[key][k])
        else:
            mat[key].append(0)
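# Illustrative shape only: mat["Mac"] might be [4, 0, 1, ...], a made-up row of
# per-domain counts aligned with urls[0:9].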
# print("All the urls gatehered: ")
#
# print("=======================================")
#
#
# inp1=input("Enter a single charecter <x>")
# inp2=input("Enter a single charecter <y>")
# search=inp1+inp2
# print("searching for: ",search,"in all tokens")
#
# print("------------------SEARCH---------------")
# for token in dict_tokens.keys():
# if search in token:
# search_list.append(token)
#
# print("The list of words containing the entered charecters by user: ")
# print(search_list)
# print("")