modeltest.py
import time
import requests
from stem import Signal
from stem.control import Controller
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

filename = 'DrugNames.csv'
First_Row = ['Drug names']
Rows = []
num_links_to_crawl = 400

# Route every request through Tor's SOCKS proxy (default port 9050); plain
# requests cannot resolve .onion addresses. The socks5h:// scheme requires
# requests[socks] (PySocks) and lets Tor handle the DNS resolution.
proxies = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}

# Set the user agent to use for the request
user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/70.0.3538.102 Safari/537.36')
# Set the headers for the request
headers = {'User-Agent': user_agent}
# Initialize the controller for the Tor network (torrc ControlPort 9051)
with Controller.from_port(port=9051) as controller:
    # Authenticate with the controller password
    controller.authenticate(password='CristianoRonaldoCR7')
    # Set the starting URL
    url = 'http://iwggpyxn6qv3b2twpwtyhi2sfvgnby2albbcotcysd5f7obrlwbdbkyd.onion/'
    # Initialize the visited set and the link queue
    visited = set()
    queue = [url]
    # Get the list of keywords to search for
    # keywords = input('Enter a list of keywords to search for, separated by commas: ').split(',')
    # Crawl the links
    while queue and len(visited) < num_links_to_crawl:
        # Get the next link in the queue
        link = queue.pop(0)
        # Skip the link if it has already been visited
        if link in visited:
            continue
        visited.add(link)
        # Request a new Tor circuit and wait until the signal is honored
        controller.signal(Signal.NEWNYM)
        time.sleep(controller.get_newnym_wait())
        # Send the request to the URL through the Tor proxy
        try:
            response = requests.get(link, headers=headers, proxies=proxies,
                                    timeout=60)
        except requests.RequestException as exc:
            print(f'Failed to fetch {link}: {exc}')
            continue
        # Parse the response
        soup = BeautifulSoup(response.text, 'html.parser')
        # Print the page title
        title = soup.find('title')
        print(title.get_text(strip=True) if title else '(no title)')
        # Collect the first column of any tables on the page
        try:
            for df in pd.read_html(response.text):
                Rows.extend([value] for value in df.iloc[:, 0].astype(str))
        except ValueError:
            pass  # page has no tables
        # Enqueue the links found on this page
        for anchor in soup.find_all('a', href=True):
            queue.append(urljoin(link, anchor['href']))
    # Write the collected names to CSV once the crawl finishes
    data = pd.DataFrame(Rows, columns=First_Row)
    data.to_csv(filename, index=False)
    # Print the visited links
    print('Visited links:')
    for link in visited:
        print(link)
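
# Usage note: running this script assumes a local Tor daemon whose torrc sets
# "ControlPort 9051" with a HashedControlPassword matching the password above,
# and whose SOCKS listener is on the default port 9050. It also assumes the
# packages stem, beautifulsoup4, pandas, lxml, and requests[socks] (PySocks)
# are installed so that pd.read_html() and the socks5h:// proxy scheme work.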