# scrape_npcs.py
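"""Scrape the everythingrs.com OSRS NPC and item lists.

The script walks the paginated NPC and item tables (served behind Cloudflare,
hence cloudscraper), downloads each entry's image, and appends
Name / ID / Image Filename rows to CSV files under storage/app/scrape/.
Progress files record the last page reached so an interrupted scrape can be
resumed.
"""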
import csv
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import cloudscraper
from bs4 import BeautifulSoup

# Create a cloudscraper session to handle Cloudflare-protected pages
scraper = cloudscraper.create_scraper()

# Base URLs
npc_base_url = "https://everythingrs.com/tools/osrs/npclist/"
item_base_url = "https://everythingrs.com/tools/osrs/itemlist/"

# Directory paths for storing images and CSV files
npc_image_dir = "storage/app/scrape/npc_images/"
item_image_dir = "storage/app/scrape/item_images/"
npc_progress_file = npc_image_dir + 'npc_scrape_progress.txt'
item_progress_file = item_image_dir + 'item_scrape_progress.txt'
npc_csv_file = npc_image_dir + 'osrs_npcs.csv'
item_csv_file = item_image_dir + 'osrs_items.csv'

# Ensure directories exist
os.makedirs(npc_image_dir, exist_ok=True)
os.makedirs(item_image_dir, exist_ok=True)

# Function to load the last saved page number for scraping
def load_last_page(progress_file):
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as file:
            return int(file.read().strip())
    return 1  # Start from page 1 if no progress file exists

# Function to save the current page number for scraping
def save_progress(page_num, progress_file):
    with open(progress_file, 'w') as file:
        file.write(str(page_num))

# Function to verify that a scraped row has the expected shape
def verify_data(name, data_id, img_url):
    if not name or not isinstance(name, str):
        return False
    if not isinstance(data_id, int):
        return False
    if not img_url or not isinstance(img_url, str):
        return False  # Rows without an image URL cannot be downloaded
    # Simple check that the URL starts with http/https followed by non-whitespace
    url_pattern = re.compile(r'https?://[^\s]+')
    if not url_pattern.match(img_url):
        return False
    return True

# Function to validate that an image URL actually serves an image
def is_valid_image(image_url):
    try:
        response = scraper.head(image_url, timeout=10)
        # Default to '' so a missing Content-Type header does not raise
        content_type = response.headers.get('Content-Type', '')
        if response.status_code == 200 and 'image' in content_type:
            return True
    except Exception as e:
        print(f"Error validating image URL {image_url}: {e}")
    return False

# Function to download the image and return the saved filename
def download_image(image_url, save_dir, prefix, data_id):
    if not is_valid_image(image_url):
        print(f"Skipping invalid image: {image_url}")
        return None
    # Derive the file extension from the URL path, ignoring any query string
    img_extension = os.path.splitext(image_url.split('?')[0])[-1]
    img_filename = f"{prefix}_{data_id}{img_extension}"
    img_path = os.path.join(save_dir, img_filename)
    try:
        img_data = scraper.get(image_url, timeout=30).content
        with open(img_path, 'wb') as img_file:
            img_file.write(img_data)
        return img_filename
    except Exception as e:
        print(f"Failed to download image {image_url}: {e}")
        return None

# Function to scrape a single NPC list page and return its rows
def scrape_npc_page(page_num):
    url = npc_base_url + str(page_num)
    response = scraper.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the NPC table
    table = soup.find('table', {'class': 'table table-striped table-bordered table-list'})
    if not table:
        return None  # No table means the last page (or an invalid page) was reached
    # Extract data from the table rows
    rows = table.find_all('tr')
    scraped_data = []
    for row in rows:
        cols = row.find_all('td')
        if len(cols) > 1:  # Only process rows with valid data
            img_tag = cols[0].find('img')
            img_url = img_tag['src'] if img_tag else None
            name = cols[1].text.strip()
            npc_id = cols[2].text.strip()
            # Ensure the npc_id is an integer before verification
            try:
                npc_id = int(npc_id)
            except ValueError:
                continue  # Skip this row if the npc_id is not an integer
            # Verify data before adding it
            if verify_data(name, npc_id, img_url):
                img_filename = download_image(img_url, npc_image_dir, "npc", npc_id)
                if img_filename:  # Only append if the image download succeeded
                    scraped_data.append({
                        'Name': name,
                        'ID': npc_id,
                        'Image Filename': img_filename
                    })
    return scraped_data

# Function to scrape a single item list page and return its rows
def scrape_item_page(page_num):
    url = item_base_url + str(page_num)
    response = scraper.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the item table
    table = soup.find('table', {'class': 'table table-striped table-bordered table-list'})
    if not table:
        return None  # No table means the last page (or an invalid page) was reached
    # Extract data from the table rows
    rows = table.find_all('tr')
    scraped_data = []
    for row in rows:
        cols = row.find_all('td')
        if len(cols) > 1:  # Only process rows with valid data
            img_tag = cols[0].find('img')
            img_url = img_tag['src'] if img_tag else None
            name = cols[1].text.strip()
            item_id = cols[2].text.strip()
            # Ensure the item_id is an integer before verification
            try:
                item_id = int(item_id)
            except ValueError:
                continue  # Skip this row if the item_id is not an integer
            # Verify data before adding it
            if verify_data(name, item_id, img_url):
                img_filename = download_image(img_url, item_image_dir, "item", item_id)
                if img_filename:  # Only append if the image download succeeded
                    scraped_data.append({
                        'Name': name,
                        'ID': item_id,
                        'Image Filename': img_filename
                    })
    return scraped_data

# Function to scrape all NPC pages in parallel using multithreading
def scrape_all_npc_pages():
    start_page = load_last_page(npc_progress_file)  # Resume from the last saved page
    max_page_num = 100  # Reasonable upper bound on the number of NPC pages
    fieldnames = ['Name', 'ID', 'Image Filename']
    # If the CSV does not exist yet, create it and write the header row
    if not os.path.exists(npc_csv_file):
        with open(npc_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, escapechar='\\', quoting=csv.QUOTE_MINIMAL)
            writer.writeheader()
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its page number; pages complete out of order
        futures = {executor.submit(scrape_npc_page, page): page for page in range(start_page, max_page_num)}
        highest_page_done = start_page
        for future in as_completed(futures):
            page = futures[future]
            try:
                data = future.result()
                if data:
                    # Append the new rows to the CSV
                    with open(npc_csv_file, 'a', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, escapechar='\\', quoting=csv.QUOTE_MINIMAL)
                        writer.writerows(data)
                # Record the highest page completed so far as a lightweight resume marker
                highest_page_done = max(highest_page_done, page)
                save_progress(highest_page_done, npc_progress_file)
            except Exception as e:
                print(f"An error occurred while scraping NPC page {page}: {e}")
    print("NPC scraping completed.")

# Function to scrape all item pages in parallel using multithreading
def scrape_all_item_pages():
    start_page = load_last_page(item_progress_file)  # Resume from the last saved page
    max_page_num = 500  # Reasonable upper bound on the number of item pages
    fieldnames = ['Name', 'ID', 'Image Filename']
    # If the CSV does not exist yet, create it and write the header row
    if not os.path.exists(item_csv_file):
        with open(item_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, escapechar='\\', quoting=csv.QUOTE_MINIMAL)
            writer.writeheader()
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to its page number; pages complete out of order
        futures = {executor.submit(scrape_item_page, page): page for page in range(start_page, max_page_num)}
        highest_page_done = start_page
        for future in as_completed(futures):
            page = futures[future]
            try:
                data = future.result()
                if data:
                    # Append the new rows to the CSV
                    with open(item_csv_file, 'a', newline='', encoding='utf-8') as csvfile:
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, escapechar='\\', quoting=csv.QUOTE_MINIMAL)
                        writer.writerows(data)
                # Record the highest page completed so far as a lightweight resume marker
                highest_page_done = max(highest_page_done, page)
                save_progress(highest_page_done, item_progress_file)
            except Exception as e:
                print(f"An error occurred while scraping item page {page}: {e}")
    print("Item scraping completed.")

# Run the NPC scrape, then the item scrape (each scrapes its own pages in parallel)
if __name__ == "__main__":
    scrape_all_npc_pages()
    scrape_all_item_pages()
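
# Hypothetical downstream usage (a sketch, not executed by this script): the
# CSVs written above can be read back with csv.DictReader, e.g.
#
#     with open(npc_csv_file, newline='', encoding='utf-8') as f:
#         for row in csv.DictReader(f):
#             print(row['ID'], row['Name'], row['Image Filename'])
#
# Downloaded images are stored alongside the CSVs as npc_<ID>.<ext> and
# item_<ID>.<ext>.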