-
Notifications
You must be signed in to change notification settings - Fork 2
/
podsnatch.py
executable file
·193 lines (141 loc) · 5.31 KB
/
podsnatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#! /usr/bin/env python
from lxml import etree as xml
from tqdm import tqdm
import feedparser
import requests
import argparse
import signal
import time
import sys
import re
import os
# Suffix appended to an episode's file name while it is being downloaded;
# save_podcasts renames the file to its final name once download() returns,
# and ctrl_c_handler removes a leftover file carrying this suffix.
TMP_EXT = '.part'
class Show:
    """A podcast subscription built from one OPML ``<outline>`` element.

    Attributes:
        url: Value of the element's ``xmlUrl`` (or ``xmlurl``) attribute, or
            ``None`` when neither attribute is present.
        title: The ``title`` attribute, else the first 50 characters of the
            ``text`` attribute, else the last path segment of ``url``, else
            the literal ``'untitled'``.
        episode_guids: Starts as an empty list; nothing in this class fills it.
    """

    def __init__(self, outline_element):
        """Read the feed URL and a display title from ``outline_element``.

        Args:
            outline_element: Any object with a dict-style ``get(name)`` method
                that returns ``None`` for missing attributes.
        """
        self.url = (outline_element.get('xmlUrl') or
                    outline_element.get('xmlurl'))

        # Both fallbacks may be missing; normalise them to '' so that slicing
        # and splitting never run on None.
        text = outline_element.get('text') or ''
        url_tail = self.url.rstrip('/').split('/')[-1] if self.url else ''

        self.title = (outline_element.get('title') or
                      text[0:50] or
                      url_tail or
                      'untitled')
        self.episode_guids = []

    def __str__(self):
        """Return ``'<title>: <url>'``."""
        return f'{self.title}: {self.url}'

    def get_dir_name(self):
        """Return the title with every run of non-word characters replaced by ``_``."""
        return re.sub(r'[\W]+', '_', self.title)
class Episode:
    """One feed entry together with the show it belongs to.

    Every field falls back to ``''`` when the corresponding key is absent
    from the entry.

    Attributes:
        guid: Entry ``id``.
        title: Entry ``title``.
        link: Entry ``link``.
        description: Entry ``summary``.
        content: ``value`` of the first element of entry ``content``.
        number: Entry ``itunes_episode``.
        url: ``href`` of the first enclosure, or ``''`` if there is none.
        date: Entry ``published_parsed`` (passed to ``time.strftime``).
        show: The show object given to the constructor.
    """

    def __init__(self, item, show):
        """Copy the fields of interest out of ``item``.

        Args:
            item: Feed entry supporting both ``key in item`` and attribute
                access (``item.title``).
            show: The owning show; stored unchanged.
        """
        self.guid = item.id if 'id' in item else ''
        self.title = item.title if 'title' in item else ''
        self.link = item.link if 'link' in item else ''
        self.description = item.summary if 'summary' in item else ''
        self.content = item.content[0].value if 'content' in item else ''
        self.number = item.itunes_episode if 'itunes_episode' in item else ''
        self.url = item.enclosures[0].href if 'enclosures' in item and item.enclosures else ''
        self.date = item.published_parsed if 'published_parsed' in item else ''
        self.show = show

    def __str__(self):
        """Return the episode metadata as text, one field per line."""
        return f"""{self.title}
{self.number}
{self.guid}
{self.date}
{self.link}
{self.url}
{self.content if self.content else self.description}
{self.description}"""

    def get_file_name(self):
        """Build a file name from date, episode number, title and URL tail.

        Empty components are left out, so an entry without a publication
        date or episode number still yields a usable name instead of raising.

        Returns:
            The non-empty components joined with ``_``.
        """
        # Last path segment of the enclosure URL with any query string removed.
        url_tail = self.url.split('/')[-1].split('?')[0]
        ep_title = re.sub(r'[\W]+', '_', self.title)
        # The date is '' (or possibly None) when the entry has no parsed
        # publication date; strftime would raise on either.
        formatted_date = time.strftime('%Y_%m_%d', self.date) if self.date else ''

        name_tokens = [formatted_date, self.number, ep_title, url_tail]
        # Truthiness test instead of identity comparison against a literal.
        return '_'.join(str(token) for token in name_tokens if token)
def parse_ompl(ompl_path):
    """Parse an OPML file and return a ``Show`` for every feed outline in it.

    Outlines are collected at any depth below ``<body>``, so feeds grouped
    inside folder/category outlines are found too. Outlines that carry no
    ``xmlUrl``/``xmlurl`` attribute (pure grouping nodes) are skipped.

    Args:
        ompl_path: Path (or other source accepted by the XML parser) of the
            OPML document.

    Returns:
        A list of ``Show`` objects in document order.
    """
    root = xml.parse(ompl_path).getroot()
    return [Show(outline)
            for outline in root.iterfind('./body//outline')
            if outline.get('xmlUrl') or outline.get('xmlurl')]
def download(url, path, mode):
    """Stream ``url`` into the file at ``path`` while showing a progress bar.

    Prints ``ERROR downloading file`` when the server announced a
    ``content-length`` and the number of bytes written differs from it.

    Args:
        url: Address to fetch.
        path: Destination file path.
        mode: Mode string passed to ``open`` (the caller uses ``'wb'``).

    Returns:
        The number of bytes written to ``path``.
    """
    # https://stackoverflow.com/a/37573701
    block_size = 1024
    downloaded_size = 0

    # Context managers release the connection, close the progress bar and
    # close the file even if opening the file or a write fails midway.
    with requests.get(url, stream=True) as response:
        # 0 means the server did not announce a size.
        total_size = int(response.headers.get('content-length', 0))
        with tqdm(total=total_size, unit='iB', unit_scale=True) as t, \
                open(path, mode) as f:
            for data in response.iter_content(block_size):
                t.update(len(data))
                f.write(data)
                downloaded_size += len(data)

    if total_size != 0 and downloaded_size != total_size:
        print("ERROR downloading file")

    return downloaded_size
# Module-level state shared between save_podcasts (which updates it through
# `global` declarations) and ctrl_c_handler (which reads it).
# Running sum of the byte counts returned by download().
total_downloaded_size = 0
# Number of episodes downloaded so far.
total_downloaded = 0
# Destination path of the episode currently being processed ('' until the
# first episode is reached).
full_path = ''
def convert_to_size(size):
    """
    Takes a number of bytes and converts it to a string that is a human readable size.

    The value is divided by 1000 until it drops below 1000 or the largest
    unit (YB) is reached, then formatted with two decimals followed by the
    unit label, e.g. ``1500`` -> ``'1.50KB'``.
    """
    size_labels = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']

    converted_size = size
    counter = 0
    # '>=' so that exactly 1000 of a unit rolls over to the next one; the
    # bound on counter keeps huge values on the last label instead of
    # indexing past the end of size_labels.
    while converted_size >= 1000 and counter < len(size_labels) - 1:
        converted_size /= 1000
        counter += 1

    return f'{converted_size:.2f}{size_labels[counter]}'
def save_podcasts(opml, output, episode_count=None):
    """Download episodes for every show listed in an OPML file.

    For each show a directory named after the show is created under
    ``output``. Each new episode is downloaded to a temporary ``TMP_EXT``
    file, renamed to its final name, and accompanied by a ``.txt`` file
    holding the episode's metadata. Episodes whose target file already
    exists, or that have no enclosure URL, are skipped and do not count
    towards ``episode_count``.

    Updates the module-level ``total_downloaded_size``, ``total_downloaded``
    and ``full_path`` so the interrupt handler can report progress and clean
    up, and prints a summary when all shows are done.

    Args:
        opml: Path of the OPML file to read.
        output: Directory under which the show directories are created.
        episode_count: Maximum number of episodes to download per show, as
            an int or a decimal string; ``None`` means no limit.

    Raises:
        ValueError: If ``episode_count`` is not a valid integer.
    """
    global total_downloaded_size
    global total_downloaded
    global full_path

    # Parsed once, up front; int() accepts both str and int, whereas
    # int(value, 10) rejects an int argument.
    max_per_show = int(episode_count) if episode_count is not None else None

    for show in parse_ompl(opml):
        print(f'Processing show {show.title}')
        feed = feedparser.parse(show.url)

        show_path = os.path.join(output, show.get_dir_name())
        os.makedirs(show_path, exist_ok=True)

        show_downloaded = 0
        for item in feed.entries:
            if max_per_show is not None and show_downloaded >= max_per_show:
                break

            episode = Episode(item, show)
            print(f'Processing episode {episode.title}')

            full_path = os.path.join(show_path, episode.get_file_name())
            print(full_path)

            if not episode.url:
                # Previously reported as "already downloaded", which was wrong.
                print('Episode has no downloadable file, skipping')
                continue
            if os.path.exists(full_path):
                print('Episode already downloaded!')
                continue

            print('Downloading episode')
            total_downloaded_size += download(episode.url, full_path + TMP_EXT, 'wb')

            # Only a completed download gets the final name.
            os.rename(full_path + TMP_EXT, full_path)

            # Feed text is frequently non-ASCII; do not depend on the
            # platform's default encoding.
            with open(full_path + ".txt", "w", encoding="utf-8") as handle:
                handle.write(str(episode))

            show_downloaded += 1
            total_downloaded += 1

    print(f'{total_downloaded} episode(s) totaling {convert_to_size(total_downloaded_size)} downloaded')
def ctrl_c_handler(signum, frame):
    """Signal handler: remove the partial download, print totals, exit with 1.

    Args:
        signum: Signal number (unused).
        frame: Current stack frame (unused).
    """
    print('Stopping...')

    # full_path is '' until the first episode is reached; without this guard
    # the handler would target an unrelated file literally named '.part'.
    if full_path:
        try:
            os.remove(full_path + TMP_EXT)
        except FileNotFoundError:
            # No partial file exists (nothing was being downloaded).
            pass

    print(f'{total_downloaded} episode(s) totaling {convert_to_size(total_downloaded_size)} downloaded')
    sys.exit(1)
if __name__ == '__main__':
    # Command-line entry point: read the options, install the Ctrl-C handler,
    # then download everything listed in the OPML file.
    cli = argparse.ArgumentParser(description='Download podcasts.')
    cli.add_argument(
        '--opml', '-i',
        dest='opml_loc',
        required=True,
        help='path to opml file to import',
    )
    cli.add_argument(
        '--output-dir', '-o',
        dest='output_loc',
        default='.',
        help='location to save podcasts',
    )
    cli.add_argument(
        '--number-of-episodes', '-n',
        dest='ep_cnt',
        default=None,
        help='how many episodes to download. By default - download all',
    )
    options = cli.parse_args()

    # Installed after argument parsing, right before the downloads start.
    signal.signal(signal.SIGINT, ctrl_c_handler)

    save_podcasts(
        opml=options.opml_loc,
        output=options.output_loc,
        episode_count=options.ep_cnt,
    )