import requests  # the HTTP package is "requests" (plural), not "request"
from bs4 import BeautifulSoup  # BeautifulSoup parses the HTML we fetch from the website
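# both third-party packages are assumed to be installed, e.g.:
#   pip install requests beautifulsoup4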

def trade_spider(max_pages):
    # crawl the trade search listing one page at a time, up to max_pages
    page = 1
    while page <= max_pages:  # <= so that trade_spider(3) really visits 3 pages
        url = "https://buckyroom.org/trade/search.php?page=" + str(page)
        source_code = requests.get(url)
        # the full page source (a requests Response object) is now stored in source_code
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, "html.parser")  # creating a BeautifulSoup object
        # check the page source to find the class that wraps each article link
        for link in soup.findAll('a', {'class': 'item-name'}):  # getting all the titles
            href = "http://buckyroom.org/" + link.get('href')  # we want only the href attribute
            title = link.string  # .string is a property, not a method
            # print(href)
            # print(title)
        page += 1
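
# A minimal sketch, not part of the original script: requests can hit a dead or
# missing page, so fetching could be wrapped in a helper that checks the HTTP
# status before parsing. fetch_soup is a hypothetical name; raise_for_status()
# is a standard requests method that raises HTTPError on 4xx/5xx responses.
def fetch_soup(url):
    response = requests.get(url)
    response.raise_for_status()  # stop early instead of parsing an error page
    return BeautifulSoup(response.text, "html.parser")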

def get_single_item_data(item_url):
    # fetch a single listing page, print its item name, then every link on it
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    # for item_name in soup.findAll('a', {'class': 'item-name'}):
    for item_name in soup.findAll('div', {'class': 'i-name'}):
        print(item_name.string)  # item_name, not item-name: hyphens are not valid in Python names
    # how to crawl a page that is a link on the page you already crawled:
    # collect every anchor on this page and rebuild its absolute URL
    for link in soup.findAll('a'):
        href = "http://www.buckysroom.org" + link.get('href')
        print(href)
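
# A minimal sketch, not in the original script, of wiring the two functions
# together: each href that trade_spider discovers could be handed straight to
# get_single_item_data so the crawler follows the links it finds. The name
# follow_spider is hypothetical.
def follow_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = "https://buckyroom.org/trade/search.php?page=" + str(page)
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.findAll('a', {'class': 'item-name'}):
            href = "http://buckyroom.org/" + link.get('href')
            get_single_item_data(href)  # crawl each listing page as it is discovered
        page += 1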

trade_spider(3)  # crawling 3 pages