CommunityImages.py
#---------------------------------------------------------------------------------------------------------------------
#
# GENERATE LIST OF IMAGE URLS FOR TOP100GAMES
# CREATES A DIRECTORY WITH FOLDERS FOR EACH GAME CONTAINING
# A CSV OF IMAGE URLS AND A FOLDER OF DOWNLOADED IMAGES
#
#---------------------------------------------------------------------------------------------------------------------
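
# Example invocation (a sketch; the flags and their defaults are defined in the
# argparse block at the bottom of this file):
#   python CommunityImages.py --num-games 10 --num-scrolls 5 --save-dir Gameurls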
import multiprocessing as mp
from lxml import html
import requests
import os
import Top100Games
import pandas as pd
import argparse
import socket

# Define a timeout for requests
timeout = 0.5  # this is a half-second timeout
socket.setdefaulttimeout(timeout)
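# Note: socket.setdefaulttimeout only covers sockets opened without an explicit
# timeout; the requests.get call below also passes timeout= directly so the
# limit is enforced per request.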


# A function that downloads urls from a game's community page
def get_urls(appID, Num_scrolls, UrlDir):
    """
    Creates a csv file with the URLs of images on the appID's community page.
    Inputs:
        appID: int, Steam appID of the game.
        Num_scrolls: int, number of times to scroll through the community page.
        UrlDir: str, the directory in which to save the image urls.
    Returns:
        None
    """
    game = str(appID)
    # Make a directory for each game to hold a text file of the urls
    game_url_path = UrlDir + '/' + game
    # Check if the game has a directory
    if not os.path.isdir(game_url_path):
        os.makedirs(game_url_path)
    # Load the csv if it has already been made, else create it
    if os.path.exists(game_url_path + '/' + game + '_urls.csv'):
        game_url_df = pd.read_csv(game_url_path + '/' + game + '_urls.csv', index_col=False)
    else:
        game_url_df = pd.DataFrame({'URL': [], 'DOWNLOADED': []})
    # Count the number of URLs in the un-updated database
    num_prev_url = game_url_df.shape[0]
    # Scrape through with each scroll
    game_urls = []
    for scroll in range(1, Num_scrolls + 1):  # pages 1 through Num_scrolls
        pg = str(scroll)
        # Build the community url
        game_com = 'https://steamcommunity.com/app/' + game + '/homecontent/'
        url_params = {'userreviewsoffset': '0',
                      'p': pg,
                      'workshopitemspage': pg,
                      'readytouseitemspage': pg,
                      'mtxitemspage': pg,
                      'itemspage': pg,
                      'screenshotspage': pg,
                      'videospage': pg,
                      'artpage': pg,
                      'allguidepage': pg,
                      'webguidepage': pg,
                      'integratedguidepage': pg,
                      'discussionspage': pg,
                      'numperpage': str(50),
                      'browsefilter': 'mostrecent',
                      'appid': game,
                      'appHubSubSection': '2',
                      'l': 'english',
                      'filterLanguage': 'default',
                      'searchText': '',
                      'forceanon': '1'}
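        # A sketch of the request this builds: for page 2 of a hypothetical
        # appID 570, requests encodes the params into a query string roughly like
        #   https://steamcommunity.com/app/570/homecontent/?userreviewsoffset=0&p=2&...&numperpage=50&forceanon=1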
        # Knock on the page to see if it is there
        knock = 0
        knocks = 4
        found = False
        while knock < knocks and not found:
            if scroll % 5 == 0:
                print('%s: Requesting page %d' % (game, scroll))
            try:
                # Request the page, passing the timeout explicitly
                game_pg = requests.get(game_com, params=url_params, timeout=timeout)
                # See if there is content.
                # Sometimes no exception occurs but zero content is retrieved
                content = len(game_pg.content)
                if content > 0:
                    found = True
                else:
                    print('%s: knocking' % game)
                    knock += 1
            except requests.exceptions.Timeout:
                print('%s: knocking' % game)
                knock += 1
                continue
            except requests.exceptions.ConnectionError:
                print('%s: knocking (connection error)' % game)
                knock += 1
                continue
        # Scrape for the sources of the images displayed,
        # but only if content was found
        if found:
            game_tree = html.fromstring(game_pg.content)
            new_urls = game_tree.xpath('//img[@class = "apphub_CardContentPreviewImage"]/@src')
        else:
            new_urls = []
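        # The xpath above matches the preview thumbnails on the app hub page,
        # i.e. tags of the form <img class="apphub_CardContentPreviewImage" src="...">,
        # and keeps only the src attribute of each.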
        game_urls += new_urls
        if knock >= knocks:
            # The page never answered; give up on this game's community hub
            print('%s: Community not found' % game)
            break
    # Put the new urls together and get rid of repeats
    game_urls = list(set(game_urls))
    print('%s: Found %d urls' % (game, len(game_urls)))
    found_urls_df = pd.DataFrame({'URL': game_urls, 'DOWNLOADED': [0] * len(game_urls)})
    # Append the new data (pd.concat, since DataFrame.append is deprecated)
    game_url_df = pd.concat([game_url_df, found_urls_df], ignore_index=True)
    # Drop duplicates
    game_url_df = game_url_df.drop_duplicates(subset='URL')
    new_df_size = game_url_df.shape[0]
    num_new_url = new_df_size - num_prev_url
    print(game + ': ' + str(num_new_url) + ' new URLs added to database')
    print(game + ': Saving URLs')
    game_url_df.to_csv(game_url_path + '/' + game + '_urls.csv', index=False)
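

# Example direct call for a single game (hypothetical values; 440 is Team
# Fortress 2's appID):
#   get_urls(440, 10, 'Gameurls')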

# Make an unpacked version
def get_urls_unpack(tag):
    """tag: list or tuple containing [appID, Num_scrolls, UrlDir]"""
    appID, Num_scrolls, UrlDir = tag
    get_urls(appID, Num_scrolls, UrlDir)
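
# Pool.map passes exactly one argument to each worker, so the unpacked version
# lets a packed [appID, Num_scrolls, UrlDir] entry stand in for the three
# arguments, e.g. get_urls_unpack([440, 10, 'Gameurls']) with the same
# hypothetical values as above.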


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Input arguments
    parser.add_argument('--num-games',
                        help='The number of games to download url lists for',
                        default=98,
                        type=int)
    parser.add_argument('--num-scrolls',
                        help='The number of scrolls through the community pages. Each scroll yields roughly 25-50 images',
                        default=18,
                        type=int)
    parser.add_argument('--num-cores',
                        help='The number of cores to use',
                        default=mp.cpu_count(),
                        type=int)
    parser.add_argument('--save-dir',
                        help='The directory in which to store the url lists',
                        default='Gameurls',
                        type=str)
    args = parser.parse_args()
    # Get the top games list
    if not os.path.exists('Top100Games.csv'):
        print('Creating Top 100 List...')
        Top100Games.main()
    Game_df = pd.read_csv('Top100Games.csv')
    # Make the Gameurl dir
    if not os.path.isdir(args.save_dir):
        os.makedirs(args.save_dir)
    # Get the appIDs
    appIDs = list(Game_df['STEAM ID'])[:args.num_games]
    # Package each appID with the other arguments for Pool.map
    for i in range(len(appIDs)):
        appIDs[i] = [appIDs[i], args.num_scrolls, args.save_dir]
    # Scrape the games in parallel
    with mp.Pool(processes=args.num_cores) as pool:
        pool.map(get_urls_unpack, appIDs)
        pool.close()
        pool.join()