-
Notifications
You must be signed in to change notification settings - Fork 34
/
yt-feed-to-email
executable file
·408 lines (337 loc) · 15.4 KB
/
yt-feed-to-email
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#!/usr/bin/env python
import itertools as it, functools as ft, operator as op
import pathlib as pl, datetime as dt, urllib.parse as up, hashlib as hl, subprocess as sp
import xml.etree.ElementTree as etree
import os, sys, re, logging, time, calendar, base64, json, textwrap, unicodedata
import feedparser as fp # pip install --user feedparser
import http.client as hc, urllib.error as ue # feedparser fetches can raise those errors
class LogMessage:
    """Lazily formatted log message using str.format() "{}" placeholders.

    Formatting is deferred until str() is called on the instance,
    so the template is only expanded if the message is actually emitted.
    """

    def __init__(self, fmt, a, k):
        # fmt - message template, a - positional args, k - keyword args for it
        self.fmt, self.a, self.k = fmt, a, k

    def __str__(self):
        # __str__ must return a real str, so non-str templates (e.g. an exception
        # object logged directly) are converted instead of raising TypeError.
        fmt = self.fmt if isinstance(self.fmt, str) else str(self.fmt)
        # Template is returned verbatim when there is nothing to substitute,
        # so literal braces are fine in argument-less messages.
        if not (self.a or self.k): return fmt
        return fmt.format(*self.a, **self.k)
class LogStyleAdapter(logging.LoggerAdapter):
    """Logger adapter that formats messages with str.format() "{}" templates."""

    def __init__(self, logger, extra=None):
        super().__init__(logger, extra or dict())

    def log(self, level, msg, *args, **kws):
        """Emit msg at level, wrapping it with its args into a LogMessage.

        Only "exc_info" is handed to the underlying logger as a keyword;
        whatever process() returns as kwargs is used for message formatting.
        """
        if self.isEnabledFor(level):
            passthrough = dict()
            if 'exc_info' in kws: passthrough['exc_info'] = kws.pop('exc_info')
            text, fmt_kws = self.process(msg, kws)
            self.logger._log(level, LogMessage(text, args, fmt_kws), (), **passthrough)
def err_fmt(err):
    """Return "[ExceptionClass] message" representation of an exception."""
    return f'[{err.__class__.__name__}] {err}'


def get_logger(name):
    """Return {}-style logger adapter for the "ytf2e.<name>" logger."""
    return LogStyleAdapter(logging.getLogger(f'ytf2e.{name}'))


def str_norm(v):
    """Normalize string for comparisons: strip, NFKC-normalize and casefold."""
    return unicodedata.normalize('NFKC', v.strip()).casefold()
def str_hash(p):
    """Return short 12-char urlsafe-base64 BLAKE2s digest of str(p)."""
    digest = hl.blake2s(str(p).encode(), person=b'ytf2e.s1').digest()
    return base64.urlsafe_b64encode(digest).decode()[:12]


def tuple_hash(*data):
    """Return str_hash() of a sequence of simple values.

    Values can be passed either as separate arguments or as a single tuple/list.
    Supported types: None, int, str, tzinfo, datetime, timedelta,
    set (hashed in sorted order) and nested tuples/lists of these.
    Raises ValueError for any other value type.
    """
    if len(data) == 1 and isinstance(data[0], (tuple, list)): data = data[0]
    src = list()
    for v in data:
        if v is None: src.append('\ue003')
        elif isinstance(v, (int, str, dt.tzinfo)): src.append(str(v))
        elif isinstance(v, (tuple, list)): src.append(tuple_hash(v))
        elif isinstance(v, dt.datetime):
            # Aware timestamps are converted to UTC, so that same instant
            # hashes the same regardless of zone; naive ones are used as-is.
            if v.tzinfo is not None: v = v.astimezone(dt.timezone.utc)
            src.append(v.strftime('%Y-%m-%dT%H:%M:%S'))
        # f-string is required here, otherwise all timedeltas hash the same
        elif isinstance(v, dt.timedelta): src.append(f'\ue002{v.total_seconds()}')
        elif isinstance(v, set): src.append(tuple_hash(sorted(v)))
        else: raise ValueError(type(v), v)
    # Each value is wrapped in \ue001 markers (doubled if found inside the value)
    # and joined by \ue000, so that value boundaries stay unambiguous.
    return str_hash('\ue000'.join(
        '\ue001{}\ue001'.format(v.replace('\ue001', '\ue001'*2)) for v in src ))
def str_repr(s, max_len=160, len_bytes=False, ext=' ...[{s_len}]'):
    """Return printable one-line representation of s, truncated to max_len.

    s - str, bytes (decoded as utf-8 with replacement) or any object (via str()).
    max_len - max length of the result, values <= 0 disable truncation.
    len_bytes - count truncated result length in utf-8 bytes instead of characters.
    ext - suffix template for truncated values,
        where {s_len} is replaced by "<max_len>/<original length>".
    If the suffix alone is longer than max_len, only the suffix is returned.
    """
    if isinstance(s, bytes): s = s.decode('utf-8', 'replace')
    if not isinstance(s, str): s = str(s)
    s_len = f'{len(s):,d}'
    # repr() without enclosing quotes, with quote-escaping undone for readability
    s_repr = repr(s)[1:-1].replace("\\'", "'").replace('\\"', '"')
    if max_len <= 0 or len(s_repr) <= max_len: return s_repr
    # Actual suffix is used for the length budget instead of a fixed-size
    # placeholder, so that result does not exceed max_len for large values.
    suffix = ext.format(s_len=f'{max_len}/{s_len}')
    if not len_bytes:
        # max() guards against negative slice bound keeping most of the string
        return s_repr[:max(0, max_len - len(suffix))] + suffix
    n = max(0, max_len - len(suffix.encode()))
    # errors=ignore drops multibyte character that got cut in the middle
    return s_repr.encode()[:n].decode(errors='ignore') + suffix
def dd(text):
    """Dedent text, strip leading/trailing newlines, end it with a single one.

    All tab characters in the result are replaced by spaces.
    """
    dedented = textwrap.dedent(text).strip('\n')
    return f'{dedented}\n'.replace('\t', ' ')


def fill(s, w=90, ind='', ind_next=' ', **k):
    """Wrap s to width w via textwrap.fill().

    ind is the first line indent, ind_next is the indent of all subsequent
    lines (None to use ind there as well), extra keywords are passed through.
    """
    if ind_next is None: ind_next = ind
    return textwrap.fill(s, w, initial_indent=ind, subsequent_indent=ind_next, **k)
class YTFeed:
    """Single feed with its check/update state.

    Identity fields (title, chan, url, title_filter) are set by the constructor,
    state fields below are class-level defaults that get overridden
    on instances when stored state is loaded.
    """

    title = chan = url = title_filter = None
    ts_last_check = 0 # posix timestamp of the last check
    delay_ewma = 24 * 3600 # moving average of interval between entries, seconds
    delay_ewma_max = 30 * 24 * 3600 # upper limit for delay_ewma
    delay_ewma_a = 0.3 # weight of the latest interval in delay_ewma
    etag = seen_entries = None

    def __init__(self, url, title=None, title_filter=None, chan=None):
        """Create feed for url.

        chan defaults to "channel_id" query parameter of the url
        (KeyError if there is none), title defaults to "chan.<chan>".
        Title whitespace is collapsed and it is cut to 40 characters.
        title_filter is expected to be a compiled regexp, if specified.
        """
        if not chan: chan, = up.parse_qs(up.urlparse(url).query)['channel_id']
        if not title: title = f'chan.{chan}'
        title = ' '.join(title.replace('\n', ' ').split())
        if len(title) > 40: title = f'{title[:38]}--'
        self.title, self.chan, self.url, self.title_filter = title, chan, url, title_filter

    def __repr__(self):
        chan = self.chan
        if self.title_filter: chan = f'{chan} || {self.title_filter}'
        # Without "return" here, repr() and string formatting raise TypeError
        return f'YTFeed({self.title} [{chan}])'

    @classmethod
    def from_xml(cls, attrs):
        """Create feed from element attributes with xmlUrl and optional title."""
        return cls(attrs['xmlUrl'], attrs.get('title'))

    @classmethod
    def from_line(cls, line):
        """Create feed from "<url> [title] [|| title-filter-regexp]" text line."""
        title_filter, line = None, line.strip().split(None, 1)
        url, title = line if len(line) > 1 else (line[0], None)
        # title is None for lines that have only URL on them
        if title and '||' in title:
            title, title_filter = map(str.strip, title.split('||', 1))
            title_filter = re.compile(title_filter)
        return cls(url, title, title_filter)

    def ts_check_next(self):
        """Return timestamp after which this feed should be checked again."""
        # /2 to run 2x fetches per (average-ish) interval between entries
        return self.ts_last_check + self.delay_ewma / 2

    def entry_id(self, e):
        """Return hash-id for feed entry e, or None if it has no usable keys."""
        e_id = None
        # First non-empty value is used, in this order of preference
        for k in 'id', 'yt_videoid', 'title', 'link', 'published', 'modified', 'created':
            if e_id := e.get(k): break
        if e_id: e_id = tuple_hash('id.1', e_id)
        return e_id

    def filter_check(self, title):
        """Return True if entry title passes title_filter (or there is none)."""
        if not self.title_filter: return True
        return bool(self.title_filter.search(title))
class YTFeedIndex:
    """Collection of YTFeed objects indexed by their channel id."""

    def __init__(self, *feed_idxs):
        self.idx = dict()
        for src in feed_idxs: self.add(src)

    def add(self, feeds):
        """Add a feed, all feeds from another index, or an iterable of feeds.

        Feeds that are added later replace earlier ones with the same channel id.
        """
        if isinstance(feeds, YTFeed):
            self.idx[feeds.chan] = feeds
            return
        if isinstance(feeds, YTFeedIndex):
            self.idx.update(feeds.idx)
            return
        for feed in feeds: self.idx[feed.chan] = feed

    def get(self, chan, fallback=None):
        """Return feed for the channel id, or fallback if it is not indexed."""
        return self.idx.get(chan, fallback)

    def __bool__(self): return bool(self.idx)

    # Membership is checked by channel id of the passed feed object
    def __contains__(self, f): return f.chan in self.idx

    def __len__(self): return len(self.idx)

    def __iter__(self): return iter(self.idx.values())
# State log line format: {ts} :: {chan-id} {chan-name!r} :: {update-json}
state_log_name = 'updates.log'
# Log file size in bytes (3 MiB), above which state_process rotates it
state_log_max_size = 3 * 1024 * 1024
def state_process(state_dir, feeds):
    """Load stored per-feed state from the update log in state_dir.

    Latest logged update for each feed in the feeds index is applied to it
    as ts_last_check, etag, seen_entries and delay_ewma attributes.
    Missing state dir and log file are created. Log that is larger than
    state_log_max_size is renamed to *.old and replaced by a new one,
    which has only the latest update line for each known feed.
    Returns path to the log file, or None if state_dir is empty.
    """
    if not state_dir: return
    state_dir, log = pl.Path(state_dir), get_logger('state')
    # Keyword here is exist_ok, misspelling it is a TypeError with missing dir
    state_dir.mkdir(mode=0o700, parents=True, exist_ok=True)
    state_last, state_log = dict(), state_dir / state_log_name
    with state_log.open('a+') as src: # a+ creates the file if it is missing
        src.seek(0)
        for line in src:
            # Channel lookup is under the same "try" as json parsing,
            # so that malformed records are skipped instead of aborting the run
            try:
                update = json.loads(line.split(' :: ', 2)[-1])
                chan = update['chan']
            except (ValueError, KeyError, TypeError) as err:
                log.error('Failed to process feed-state entry: {} -- {!r}', err_fmt(err), line)
                continue
            feed = feeds.get(chan)
            if not feed:
                log.debug('Dropping update(s) for nx feed: {}', str_repr(line))
                continue
            # Later lines replace earlier ones, leaving only the latest update
            state_last[feed.chan] = line, feed, update
    state_log_new = None
    if state_log.stat().st_size > state_log_max_size:
        state_log.rename(state_dir / f'{state_log_name}.old')
        state_log_new = state_log.open('a')
    try:
        for line, feed, update in state_last.values():
            for k in 'ts_last_check etag seen_entries delay_ewma'.split():
                try: setattr(feed, k, update[k])
                except KeyError:
                    # seen_entries is the only key that can be missing in updates
                    if k != 'seen_entries': raise
            if state_log_new: state_log_new.write(line)
    finally:
        if state_log_new: state_log_new.close()
    return state_log
def state_update(state_log, ts, feed, update):
    """Append update for the feed to state_log file as a single line.

    Line format is "{ts} :: {chan-id} {chan-name!r} :: {update-json}",
    where ts is posix timestamp formatted as local time with minute precision.
    """
    ts_str = time.strftime('%Y-%m-%d %H:%M', time.localtime(ts))
    line = f'{ts_str} :: {feed.chan} {feed.title!r} :: {json.dumps(update)}\n'
    with state_log.open('a') as dst: dst.write(line)
class FeedFetchError(Exception):
    """Raised by feed_fetch when feed cannot be fetched or parsed."""


# User-Agent and Accept header values used for feed requests in feed_fetch
feed_user_agent = 'yt-feed-to-email/0.1 feedparser/{}'.format(fp.__version__)
feed_accept_header = ','.join([
    'application/atom+xml',
    'application/rdf+xml',
    'application/rss+xml',
    'application/x-netcdf',
    'application/xml;q=0.9',
    'text/xml;q=0.2',
])
def feed_fetch(log, url, etag):
    """Fetch and parse feed from url, using etag for a conditional request.

    Returns (etag, entries) tuple, where entries is None if feed
    was not modified (HTTP 304), with passed etag returned back in that case.
    Raises FeedFetchError on connection/parser errors,
    HTTP status >= 400 and any 3xx status other than 304.
    log argument is not used here at the moment.
    """
    try:
        feed = fp.parse( url, agent=feed_user_agent,
            etag=etag, request_headers=dict(Accept=feed_accept_header) )
        status, bozo, bozo_err = (
            feed.get(k) for k in ['status', 'bozo', 'bozo_exception'] )
    except (hc.HTTPException, ue.URLError) as err:
        status, bozo, bozo_err = 0, True, err_fmt(err)
    # Missing status counts as an error here, same as 4xx/5xx ones
    if (not status and bozo) or (status or 1000) >= 400:
        raise FeedFetchError(f'feedparser error (status={status}): {url} - {bozo_err}')
    # Tuple is returned for not-modified feeds as well, so that callers
    # can always unpack the result, checking entries for None afterwards.
    if status == 304: return etag, None
    if status >= 300: raise FeedFetchError(f'Unhandled 3xx response status {status}')
    return feed.get('etag'), feed.entries
def feed_process(log, feed, entries, ts, ts_since=None):
    """Generate notifications and updated state values for fetched feed entries.

    feed - YTFeed to process entries for, ts - current posix timestamp.
    ts_since - if not None, entries published at or before it are skipped as old.
    Entries that do not pass feed title filter are ignored entirely,
    ones with ids in feed.seen_entries do not produce notifications.
    Returns (emails, ewma_delay, feed_ids) tuple, where emails is a list
    of (entry-ts, subject, body) tuples, ewma_delay is a new value for the
    feed delay_ewma, and feed_ids is a new {entry-id: ts} seen_entries mapping.
    Raises ValueError for entries that id cannot be produced for.
    """
    first_old_id = e_ts_prev = None
    ewma_delay, ewma_a = feed.delay_ewma, feed.delay_ewma_a
    emails, feed_ids, seen_ids = list(), dict(), feed.seen_entries or dict()
    for e in entries:
        if not feed.filter_check(e.title): continue
        e_id, e_ts = feed.entry_id(e), calendar.timegm(e.published_parsed)
        if not e_id: raise ValueError(f'Failed to get id for feed entry: {str_repr(e)}')
        feed_ids[e_id] = e_ts
        if not first_old_id: # skips all older entries in ewma calculation
            if e_ts_prev:
                # abs() makes interval positive regardless of entry ordering in the
                # feed - with newest-first order, plain difference is negative
                # and drives the average (and check interval) below zero.
                ewma_delay = ewma_a * abs(e_ts - e_ts_prev) + (1 - ewma_a) * ewma_delay
            e_ts_prev = e_ts
        if ts_since is not None and e_ts <= ts_since:
            log.debug( 'Skipping old entry [id={} ts={}]: pub={}'
                ' author={} title={!r}', e_id, e_ts, e.published, e.author, e.title )
            if not first_old_id: first_old_id = e_id
            continue
        if e_id in seen_ids:
            log.debug( 'Skipping already-seen entry [id={} ts={}]: pub={}'
                ' author={} title={!r}', e_id, e_ts, e.published, e.author, e.title )
            if not first_old_id: first_old_id = e_id
            continue
        log.debug( 'Generating notification for entry [{}]:'
            ' ts={} author={!r} title={!r}', e_ts, e.published, e.author, e.title )
        body = dd(
            f'Title: {e.title}\n'
            f'Author: {e.author}\n'
            f'Published: {e.published}\n'
            f'Link: {e.link}' )
        if summary := e.get('summary'):
            body += '\nSummary:\n' + process_summary(summary)
        emails.append((e_ts, f'YT [{e.author}]: {e.title}', body))
    if e_ts_prev and not emails and (ts - e_ts_prev) > ewma_delay:
        # This makes empty checks bump delay up
        ewma_delay = ewma_a * (ts - e_ts_prev) + (1 - ewma_a) * ewma_delay
    elif not e_ts_prev: log.warning('Empty feed - check/remove it from list') # can be a bug too
    # Limit applies to delay from intervals between entries as well
    ewma_delay = min(feed.delay_ewma_max, ewma_delay)
    # Find oldest common e_id and merge all newer seen_ids into feed_ids
    # This is done to avoid notifications for flapping visible/hidden entries
    # Relies on seen_ids order, presumably newest-first as produced below
    n = 0
    for n, e_id in enumerate(reversed(list(seen_ids))):
        if e_id in feed_ids: break
    # Last n entries of seen_ids get dropped here
    if n > 0: seen_ids = dict((e_id, seen_ids[e_id]) for e_id in list(seen_ids)[:-n])
    feed_ids = dict(sorted( # (e_id, ts) with later ts taking prio for same id
        it.chain(seen_ids.items(), feed_ids.items()), key=op.itemgetter(1) ))
    feed_ids = dict(sorted(feed_ids.items(), key=op.itemgetter(1), reverse=True))
    return emails, ewma_delay, feed_ids
def process_summary(text, w=120, pre=' '):
    """Format entry summary text for inclusion in a notification body.

    Consecutive duplicate lines (compared without trailing whitespace)
    are collapsed into one, each remaining line is wrapped to width w via
    fill(), prefixed by pre, and terminated by a newline.
    """
    prev, chunks = None, list()
    for raw in text.strip().split('\n'):
        line = raw.rstrip()
        if line == prev: continue
        prev = line
        chunks.append(fill(raw, w) + '\n')
    return ''.join(f'{pre}{chunk}' for chunk in chunks)
def main(args=None):
    """Script entry point: check feeds and send email notifications for new entries.

    args - list of command-line arguments, sys.argv[1:] is used if None.
    Feeds are loaded from OPML and/or RSS-list files, stored state is applied
    to them, then every feed that is due for a check gets fetched, with an email
    sent via local "mail" command for each new entry, and state update logged.
    With -c/--opml-convert, only appends new OPML feeds to the RSS-list file.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Script to generate email notifications for YouTube OPML/RSS feed updates.')

    group = parser.add_argument_group('Feed sources')
    group.add_argument('-o', '--opml', metavar='file',
        help='YT OPML export from https://www.youtube.com/subscription_manager?action_takeout=1 link.')
    group.add_argument('-r', '--rss-list', metavar='file',
        help='File with YT RSS/Atom feed URLs, one per line, with optional comments after URLs.')
    group.add_argument('-c', '--opml-convert', action='store_true',
        help='Append all new feeds from specified -o/--opml file to -r/--rss-list and exit.')

    group = parser.add_argument_group('Notification options')
    group.add_argument('-e', '--email', metavar='addr', required=True,
        help='Email address to send video notifications to via local "mail" command.')
    group.add_argument('-d', '--email-delay',
        type=float, metavar='float', default=1.1,
        help='Delay between running notification-command in seconds.'
            ' Useful to make sure emails sort by date/time correctly within same channel,'
            ' and don\'t all have same exact timestamp. Default is %(default)ss (0 - disable).')

    group = parser.add_argument_group('State storage/init')
    group.add_argument('-s', '--state-dir', default='state', metavar='path',
        help='Directory to use for storing per-feed "last check" timestamps. Default: %(default)s')
    group.add_argument('-t', '--new-feed-time',
        type=float, metavar='posix-ts',
        help='Timestamp to fetch entries after for new/unknown feeds.'
            ' Default or 0 is to generate notifications for all entries in new feeds.')

    group = parser.add_argument_group('Check filtering and rate-limiting')
    group.add_argument('-n', '--feed-name', metavar='name',
        help='Name (part) of a specific feed to check regardless of timestamps.')
    group.add_argument('-m', '--max-checks', type=int, metavar='n',
        help='Limit on number of feeds to check in one run. Default or 0 - no limit.')
    group.add_argument('-f', '--force', action='store_true',
        help='Force-check feeds regardless of timestamps.')

    group = parser.add_argument_group('Debug options')
    group.add_argument('--debug', action='store_true', help='Verbose operation mode.')
    group.add_argument('--dry-run', action='store_true',
        help='Run same stuff, but do not send emails or update state.')

    opts = parser.parse_args(sys.argv[1:] if args is None else args)

    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.WARNING)
    log = get_logger('main')

    feeds_rss, feeds_opml = list(), list()

    if opts.opml:
        def _get_outlines(e, feeds=None):
            # Recursively collects feeds from all elements with xmlUrl attribute
            if feeds is None: feeds = list()
            for o in e:
                if o.attrib.get('xmlUrl'): feeds.append(YTFeed.from_xml(o.attrib))
                else: _get_outlines(o, feeds)
            return feeds
        opml = etree.fromstring(pl.Path(opts.opml).read_text())
        feeds_opml = _get_outlines(opml.find('body'))
        log.debug('Parsed OPML: feeds={}', len(feeds_opml))
        feeds_opml = YTFeedIndex(feeds_opml)

    if opts.rss_list:
        rss_list = pl.Path(opts.rss_list)
        if rss_list.exists():
            with rss_list.open() as src:
                # Empty lines and ones starting with "#" are skipped
                feeds_rss = list( YTFeed.from_line(line)
                    for line in src if line.strip() and not line.lstrip().startswith('#') )
            log.debug('Parsed RSS-list: feeds={}', len(feeds_rss))
        feeds_rss = YTFeedIndex(feeds_rss)

    if opts.opml_convert:
        if not opts.rss_list: parser.error('-c/--opml-convert requires -r/--rss-list option')
        n = 0
        with rss_list.open('a') as dst:
            for feed in feeds_opml:
                if feed in feeds_rss: continue
                dst.write(f'{feed.url} {feed.title}\n')
                print(f'Added feed: {feed}')
                n += 1
        print(f'-- added feeds: {n}')
        return

    feeds = YTFeedIndex(feeds_opml, feeds_rss)
    state_log = state_process(opts.state_dir, feeds)
    log.debug('Feed Index: feeds={} state-log={}', len(feeds), state_log)

    ts, ts_email, ts_new_feed = time.time(), 0, opts.new_feed_time or 0
    # Counted down after each processed feed, stopping the loop at zero
    check_limit = opts.max_checks or 0
    feed_lookup = opts.feed_name and str_norm(opts.feed_name)

    for feed in feeds:
        if feed_lookup:
            if feed_lookup not in str_norm(feed.title): continue
        elif not opts.force and ts < feed.ts_check_next(): continue

        feed_log = get_logger(f'feed.{feed.chan}')
        feed_log.debug('Fetching feed: {} [ {} ]', feed.title, feed.url)
        try: fetched = feed_fetch(feed_log, feed.url, feed.etag)
        except FeedFetchError as err:
            feed_log.error('Failed to fetch feed: {}', err)
            continue
        # Not-modified feeds can be returned as either None or (etag, None)
        etag, entries = fetched or (feed.etag, None)
        if entries is None: continue

        ts_since = None
        if not feed.seen_entries:
            # No seen-ids to compare against, so entries are filtered by time
            ts_since = feed.ts_last_check or ts_new_feed
        log.debug( 'Processing feed:'
            ' entries={} etag={!r} new-since={}', len(entries), etag, ts_since )
        emails, ewma_delay, seen_ids = \
            feed_process(feed_log, feed, entries, ts, ts_since)

        if emails and not opts.dry_run:
            log.debug( 'Sending notification emails:'
                ' count={} delay={:.1f}', len(emails), opts.email_delay )
            for e_ts, subject, body in sorted(emails):
                # Sleeps for whatever is left of email_delay since last email
                time.sleep(max(0, opts.email_delay - (time.time() - ts_email)))
                sp.run(
                    ['mail', '-s', subject, opts.email],
                    input=body.encode(), timeout=4*60, check=True )
                ts_email = time.time()

        update = dict(
            chan=feed.chan, delay_ewma=ewma_delay,
            ts_last_check=ts, etag=etag, seen_entries=seen_ids )
        # state_log is None if state dir is set to an empty value
        if state_log is not None and not opts.dry_run:
            state_update(state_log, ts, feed, update)

        if check_limit:
            check_limit -= 1
            if check_limit <= 0:
                log.debug('Stopping due to -m/--max-checks limit')
                break

    log.debug('Finished')


if __name__ == '__main__': sys.exit(main())