#!/usr/bin/env python3
#The libraries required for this code: urllib.request and requests do the
#downloading, os and shutil handle the file I/O, re and string handle the
#text, and sys reads the command-line input.
import os
import re
import sys
import shutil
import string
import urllib.request
import requests
from bs4 import BeautifulSoup
#In 5.2, I added a User-Agent to hopefully combat the rate limiting.
def getPageSoup(pageurl):
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/81.0.4044.138 Chrome/81.0.4044.138 Safari/537.36"
    req = urllib.request.Request(pageurl, headers={'User-Agent': user_agent})
    content = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(content, "lxml")
    return soup
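
#A minimal usage sketch (the thread URL below is hypothetical):
#    soup = getPageSoup("https://boards.4chan.org/wg/thread/123456789")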
#From now on, all the values will be stored in lists. At some point it's going
#to dump to a log file, and this will help.
def getNameNumUrls(soup):
    ltm_a = []
    ltm_b = []
    ltm_c = []
    for link in soup.find_all('div', {'class': 'fileText'}):
        ltm_a.append(link.a['href'])
        try:
            ltm_b.append(link.a['title'])
        except (AttributeError, KeyError):  # no <a> tag, or no 'title' attribute on it
            ltm_b.append(link.a.text)
        fn = re.findall(r'\d{5,20}', link.a['href'])[0]
        ltm_c.append(fn)
    return ltm_b, ltm_c, ltm_a
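
#An illustration of the three parallel lists this returns (the values are
#made up, not from a real thread):
#    names   -> ['cat picture.jpg', ...]         (original upload names)
#    numbers -> ['1589000000000', ...]           (server-side file numbers)
#    urls    -> ['//i.4cdn.org/wg/1589000000000.jpg', ...]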
#A file-count function.
def filecount(DIR):
    return len([name for name in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, name))])
#The completed function for constructing 4chan folder names.
def makeChanDir(soup):
    name = soup.title.string.strip()
    boardOld, threadRaw, dis, chan = name.split(" - ")
    board = boardOld.replace("/", " ").upper()
    alphabet = string.ascii_letters + string.digits + "'[];=+~()#&,.!-_ "
    thread = ''
    for char in threadRaw:
        if char in alphabet:
            thread += char
    newName = "Site-[ 4chan ] - Board-[ {} ] - Thread-[ {} ]".format(board, thread)
    return newName
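
#For illustration (this tab title is made up): a page title such as
#"/wg/ - Wallpapers - General - 4chan" would come back as
#"Site-[ 4chan ] - Board-[  WG  ] - Thread-[ Wallpapers ]".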
#Returns a list of all of the command-line arguments.
def getAllArgs(howOut="return"):
    allArgs = sys.argv[1:]
    if howOut == "print":
        print(len(allArgs))
        for i in allArgs:
            print(i)
    elif howOut == "return":
        return allArgs
#Main program function: downloads images and puts them into folders.
def everything(url):
    soup = getPageSoup(url)
    newName = makeChanDir(soup)
    dcwd = os.getcwd() + "/" + newName
    if os.path.exists(dcwd):
        fileInt = filecount(dcwd)
    else:
        fileInt = 0
        os.makedirs(dcwd)
    fileName, fileNumber, fileURL = getNameNumUrls(soup)
    #Sets up the numbering, even if the program restarts.
    threadCount = len(fileName)
    if fileInt == 0:
        loopCount = fileInt
    else:
        #Resume one file back, in case the last download was interrupted.
        loopCount = fileInt - 1
    #The already-downloaded images:
    print("Folder Name: " + str(newName))
    print("AD Images: " + str(fileInt))
    #And the new downloader, as well:
    while loopCount < threadCount:
        print("Image: " + str(loopCount + 1) + "/" + str(threadCount))
        fileNameTemp = fileName[loopCount]
        fileNumberTemp = fileNumber[loopCount]
        fileURLTemp = fileURL[loopCount]
        print("File Name: " + fileNameTemp)
        print("File Number: " + fileNumberTemp)
        print("File Url: " + fileURLTemp)
        extTR = os.path.splitext(fileNameTemp)[1]
        fileNameFinal = fileNumberTemp + "___ON[" + fileNameTemp + "]" + extTR
        print("File: " + fileNameFinal)
        try:
            response = requests.get("https:" + fileURLTemp, stream=True)
            with open(os.path.join(dcwd, fileNameFinal), 'wb') as out_file:
                print("Downloading: " + fileNameFinal)
                shutil.copyfileobj(response.raw, out_file)
            del response
            print("Downloaded: " + fileNameFinal + "\n")
        #requests raises its own exception type for network trouble, and it
        #must be caught before OSError (which it subclasses).
        except requests.exceptions.RequestException:
            print("Network Error on image: {}\n".format(fileNameFinal))
        except OSError:
            print("Error on image: {}\n".format(fileNameFinal))
        except Exception:
            print("Exception on image: {}\n".format(fileNameFinal))
        loopCount += 1
    return newName, threadCount
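
#Naming sketch: with the made-up values above, a file numbered 1589000000000
#that was uploaded as "cat picture.jpg" is saved as
#"1589000000000___ON[cat picture.jpg].jpg".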
#Main function. Loops over all passed thread URLs.
def main():
    r = getAllArgs()
    for n, i in enumerate(r):
        print("Page {}: {}".format(n, i))
        e, t = everything(i)
        print("{} files in: '{}'".format(t, e))

if __name__ == "__main__":
    main()
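
#A hypothetical invocation (the thread URL below is made up for illustration):
#    ./FCAIA.py https://boards.4chan.org/wg/thread/123456789
#Several thread URLs can be passed at once; each thread gets its own folder.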