pdf_utils.py
import os
from io import BytesIO, StringIO

import requests  # type: ignore
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import s3_cache

BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
CACHE_NAMESPACE = "pdf_utils"

s3_cache.set_max_size(CACHE_NAMESPACE, 100)


def extract_text_from_pdf(pdf_url):
    # Return the cached extraction if this URL has already been processed.
    cached_text = s3_cache.get_cache(CACHE_NAMESPACE, pdf_url)
    if cached_text:
        return cached_text

    # Download the PDF; Slack-hosted files require the bot token for authorization.
    response = requests.get(pdf_url, headers={"Authorization": f"Bearer {BOT_TOKEN}"})
    if response.status_code != 200:
        raise Exception(
            f"Error downloading file [{pdf_url}]. HTTP request status: {response.status_code}"
        )

    # Run the downloaded bytes through pdfminer's text-extraction pipeline.
    pdf_content = BytesIO(response.content)
    parser = PDFParser(pdf_content)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    text_output = StringIO()
    device = TextConverter(rsrcmgr, text_output, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)

    text = text_output.getvalue()
    device.close()
    text_output.close()

    # Cache the extracted text so repeated requests for the same URL skip the download.
    s3_cache.set_cache(CACHE_NAMESPACE, pdf_url, text)
    return text
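
A minimal usage sketch, assuming SLACK_BOT_TOKEN is set in the environment and the s3_cache module is importable; the file URL shown is hypothetical and stands in for a real Slack private file URL:

    # example_usage.py (hypothetical caller, not part of pdf_utils.py)
    from pdf_utils import extract_text_from_pdf

    # Placeholder URL: replace with an actual url_private value from a Slack file object.
    pdf_url = "https://files.slack.com/files-pri/T000-F000/example.pdf"

    text = extract_text_from_pdf(pdf_url)
    print(text[:500])  # print the first 500 characters of the extracted text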