"""
File: card_metadata.py
Creating a card database that contains card creatures meta-information (card_id and URL) and storing in a file.
"""
import argparse
import csv
import logging
import os
import re
import time

import coloredlogs
import selenium.common.exceptions
from selenium.webdriver.common.by import By

from gatherer_croupier.config import CARD_DIRECTORY_PATH
from gatherer_croupier.driver import initialize_driver
from gatherer_croupier.utils import get_id_string
# Create logger
logger = logging.getLogger(__name__)
coloredlogs.install(level='INFO')
# Initialize the parser
parser = argparse.ArgumentParser()
parser.add_argument("creature_type", help="Enter the creature type")
args = parser.parse_args()
# Create the CSV file with a header row if it does not already exist
check_path = os.path.isfile(CARD_DIRECTORY_PATH)
if check_path:
    logger.info('[STATUS] card_database.csv exists')
else:
    with open(CARD_DIRECTORY_PATH, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(['id', 'type', 'subtype', 'url'])
    logger.info('[STATUS] card_database.csv successfully created')
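# Illustrative card_database.csv contents after a run (the id and URL values below are hypothetical examples):
#   id,type,subtype,url
#   409574,Creature,Elf,https://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=409574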
if __name__ == '__main__':
    # Initialize the browser object
    driver = initialize_driver(headless=True, install=False)
    # Gatherer advanced-search URL for the requested creature subtype (from the argparse argument)
    url = f'https://gatherer.wizards.com/Pages/Search/Default.aspx?page=0&action=advanced&type=+["Creature"]&subtype=+["{args.creature_type}"]'
    # Open the URL in the browser
    driver.get(url)
    logger.info("[STATUS] URL successfully opened...")
    # Maximum number of card entries to probe on a single results page
    MAX_ID = 100
    logger.info("[STATUS] Determining how many result pages exist for this card type...")
    # Absolute XPath to the pager element at the bottom of the results list
    xpath_pages = "/html/body/form/div[5]/div/div[1]/div[2]/div/div[7]/div"
    try:
        num_pages = driver.find_element(By.XPATH, xpath_pages)
        # Take the last ">N<" fragment in the pager HTML as the total page count
        num_pages = re.sub('>|<', '', re.findall(r'>\d+<', num_pages.get_attribute('innerHTML'))[-1])
    except selenium.common.exceptions.NoSuchElementException:
        # No pager element means the results fit on a single page
        num_pages = '1'
    logger.info(f"[START] There are {num_pages} pages of this type of card to download...")
    num_pages = int(num_pages)
    visited_card_ids = set()
    for p in range(num_pages):
        # Page 0 is already open; only navigate when p != 0
        if p != 0:
            logger.info(f"[ACTION] Change to page {p + 1}")
            url = f'https://gatherer.wizards.com/Pages/Search/Default.aspx?page={p}&action=advanced&type=+["Creature"]&subtype=+["{args.creature_type}"]'
            driver.get(url)
            logger.info(f"[STATUS] URL for page {p + 1} successfully opened...")
        for i in range(0, MAX_ID):
            logger.info(f"[ACTION] Extract element #{i} with class='cardTitle' from the current page")
            # get_id_string converts the index into the string used in Gatherer's listRepeater element ids
            element_id = get_id_string(i)
            xpath_string = f'//*[@id="ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl00_listRepeater_ctl{element_id}_cardTitle"]'
            card_info = driver.find_elements(By.XPATH, xpath_string)
            # Get the href from the extracted element
            card_url = ''.join([elem.get_attribute('href') for elem in card_info]).replace(',', '')
            if card_url == '':
                logger.info(f"[STATUS] There are no more cards on page {p + 1}...")
                break
            logger.info(f'[STATUS] URL correctly extracted: {card_url}')
            # Get the card id from the href
            card_id = re.findall(r'\d+', card_url)[0]
            # Skip cards that have already been stored
            logger.info(f"Check whether card-id {card_id} has already been explored...")
            if card_id in visited_card_ids:
                logger.info(f"The card-id {card_id} was already explored... skip this card!")
                continue
            logger.info(f"[STATUS] The card-id {card_id} was not explored, proceed to store its info in the csv file")
            visited_card_ids.add(card_id)
            logger.info('[WRITE] Add the information to card_database.csv')
            with open(CARD_DIRECTORY_PATH, 'a', newline='') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow([card_id, "Creature", args.creature_type, card_url])
            logger.info("[STATUS] Information correctly added...")
        logger.info(f"[STATUS] Page {p + 1} successfully scraped")
        # Wait one second before going to the next page
        logger.info("[TIME] Wait one second before going to the next page...")
        time.sleep(1)
    logger.info("[STATUS] All info was scraped successfully!")
    # Close the browser
    driver.quit()