"""
General organization of content extraction.
we have datatypes like entry url content html and text
we have extractors between these types, some of which are not really extractors but we'll stick with that name
when we have multiple approaches at a conversion we aggregate the result with keep_first
in recognition of the fact that extractors can fail, they are in charge of catching own exceptions and returning ""
when multiple ways of getting
logging is taken care of elsewhere, but all exceptions are logged
"""
from functools import wraps
from string import printable
import re
import tempfile

from justext import justext, get_stoplist
from urlextract import URLExtract
import mimeparse
import requests
import textract

from rl_logging import log_call

# An extraction shorter than this counts as a partial failure: it justifies
# trying an alternate method, but not raising an exception.
SHORT_TEXT = 40


def extractor(fun):
    """An extractor doesn't fail; it logs instead and returns an empty string."""

    @wraps(fun)
    def decorated(*args, **kwargs):
        try:
            return _warn_short(fun(*args, **kwargs))
        except Exception as e:
            log_call(fun, args, kwargs, exception=e)
            return ""

    return decorated
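

# A sketch of the contract (hypothetical function, not part of this module):
# a decorated extractor swallows its own exceptions, logs them via log_call,
# and degrades to "", so callers never need a try/except.
#
#   @extractor
#   def _always_fails():
#       raise ValueError("boom")
#
#   assert _always_fails() == ""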


@extractor
def _keep_first(*strategies, **kwargs):
    """Try strategies (argumentless callables) in the order provided.

    If an extraction reaches the supplied min_length, return it; otherwise try
    the next strategy. If all strategies have been attempted, return the
    longest extraction."""
    assert len(strategies) > 1
    min_length = kwargs.get("min_length", SHORT_TEXT)
    longest = ""
    for fun in strategies:
        retval = extractor(fun)()
        payload = retval if isinstance(retval, (str, bytes)) else retval[0]
        if payload is not None and len(payload) >= min_length:
            return payload
        if payload is not None and len(payload) > len(longest):
            longest = payload
    return _warn_short(longest, min_length)
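

# A sketch of the fallback behavior (hypothetical strategies): the first
# result that reaches min_length is returned immediately; if none does, the
# longest candidate wins.
#
#   _keep_first(
#       lambda: "too short",
#       lambda: "a fallback result comfortably longer than SHORT_TEXT chars",
#   )
#   # -> the second string, since the first is under min_length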


@extractor
def get_html(*, url=None, entry=None):
    """Give me the content of the url or entry, to display in a feed reader."""
    url, entry = _process_get_args(url, entry)
    # Prefer url access, as it goes to the real content; the feed entry is often abridged.
    return _keep_first(lambda: _url2html(url), lambda: _entry2html(entry))


@extractor
def get_text(*, url=None, entry=None):
    """Give me the text associated with the url or entry, for display or ML use."""
    url, entry = _process_get_args(url, entry)
    return _keep_first(lambda: _url2text(url), lambda: _entry2text(entry))


def _process_get_args(url, entry):
    assert url is not None or entry is not None
    if url is None:
        url = get_url(entry)
    return url, entry


@extractor
def _entry2text(entry):
    return _html2text(_entry2html(entry))


def get_url(entry):
    url = entry.link
    return _entry2url_twitter(entry) if _is_twitter(url) else url


def _entry2url_twitter(entry):
    return _get_first_usable_url(_entry2html(entry)) or entry.link
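

# Note on _entry2url_twitter above: for Twitter entries, entry.link points at
# twitter.com itself, and the interesting content is presumably the first
# external URL inside the tweet body, so we pull that out of the entry HTML
# and fall back to entry.link.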


# This one needs a custom decorator because of its tuple return type.
@extractor
def _url2content(url, check_html=False):
    response = requests.get(url)
    doc_type = _get_doc_type(response)
    encoding = response.encoding or "utf-8"
    assert (not check_html) or (doc_type == "html")
    content = response.content.decode(encoding) if check_html else response.content
    return content, doc_type


@extractor
def _url2html(url):
    return _url2content(url, check_html=True)


@extractor
def _url2text(url):
    retval = _keep_first(
        lambda: _jt_extractor(url=url), lambda: _content2text_te(_url2content(url))
    )
    assert isinstance(retval, str)
    return retval


@extractor
def _jt_extractor(*, url=None, html=None):
    assert url or html
    if html is None:
        html = requests.get(url).content
    paragraphs = justext(html, get_stoplist("English"))
    return "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate])


@extractor
def _html2text(html):
    return _jt_extractor(html=html)


@extractor
def _content2text_te(content):
    data, doc_type = content
    with tempfile.NamedTemporaryFile(mode="wb", suffix="." + doc_type) as fh:
        fh.write(data)
        fh.flush()
        return textract.process(fh.name, encoding="utf-8").decode("utf-8")
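

# Note on the temp-file roundtrip above: textract chooses its parser from the
# file extension, so the "." + doc_type suffix (e.g. ".pdf") is what selects
# the backend for the downloaded bytes.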


@extractor
def _entry2html(entry):
    title = entry.get("title", "")
    content = max(
        [
            entry.get("description", ""),
            entry.get("content", [{}])[0].get("value", ""),
            entry.get("summary", ""),
        ],
        key=len,
    )
    # TODO: would it be possible to grab the title for use in other types of extraction?
    return title + "." + content


def _get_doc_type(response):
    return mimeparse.parse_mime_type(response.headers["Content-Type"])[1]


def _is_twitter(url):
    return bool(re.search(r"twitter\.com", url))  # quick and dirty


def _is_image(url):
    return url.split("/")[-1].split(".")[-1] in {"jpg", "jpeg", "png"}


def _get_first_usable_url(text):
    s = [
        url
        for url in URLExtract().find_urls(text)
        if not _is_twitter(url) and not _is_image(url)
    ]
    return "".join(filter(lambda x: x in set(printable), s[0])) if len(s) > 0 else None


def _warn_short(what, min_length=SHORT_TEXT):
    # text = what if isinstance(what, str) else what[0]
    # if len(text) < min_length:
    #     log.warning(
    #         "".join(tb.format_stack()[-10:]) + "Minimal text extracted: " + text
    #     )
    return what