-
Notifications
You must be signed in to change notification settings - Fork 482
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] add support for file_bytes argument with managed_file_context() #270
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from contextlib import contextmanager | ||
import io | ||
import os | ||
import sys | ||
|
||
|
@@ -8,12 +10,13 @@ | |
from .core import TableList | ||
from .parsers import Stream, Lattice | ||
from .utils import ( | ||
InvalidArguments, | ||
TemporaryDirectory, | ||
get_page_layout, | ||
get_text_objects, | ||
get_rotation, | ||
is_url, | ||
download_url, | ||
get_url_bytes, | ||
) | ||
|
||
|
||
|
@@ -24,19 +27,33 @@ class PDFHandler(object): | |
|
||
Parameters | ||
---------- | ||
filepath : str | ||
Filepath or URL of the PDF file. | ||
filepath : str | pathlib.Path, optional (default: None) | ||
Filepath or URL of the PDF file. Required if file_bytes is not given | ||
pages : str, optional (default: '1') | ||
Comma-separated page numbers. | ||
Example: '1,3,4' or '1,4-end' or 'all'. | ||
password : str, optional (default: None) | ||
Password for decryption. | ||
file_bytes : io.IOBase, optional (default: None) | ||
A file-like stream. Required if filepath is not given | ||
|
||
""" | ||
|
||
def __init__(self, filepath, pages="1", password=None): | ||
def __init__(self, filepath=None, pages="1", password=None, file_bytes=None): | ||
if is_url(filepath): | ||
filepath = download_url(filepath) | ||
file_bytes = get_url_bytes(filepath) | ||
|
||
if not filepath and not file_bytes: | ||
raise InvalidArguments('Either `filepath` or `file_bytes` is required') | ||
if not filepath: | ||
# filepath must either be passed, or taken from the name attribute | ||
filepath = getattr(file_bytes, 'name') | ||
if not filepath: | ||
msg = ('Either pass a `filepath`, or give the ' | ||
'`file_bytes` argument a name attribute') | ||
raise InvalidArguments(msg) | ||
self.file_bytes = file_bytes # ok to be None | ||
|
||
self.filepath = filepath | ||
if not filepath.lower().endswith(".pdf"): | ||
raise NotImplementedError("File format not supported") | ||
|
@@ -49,6 +66,28 @@ def __init__(self, filepath, pages="1", password=None): | |
self.password = self.password.encode("ascii") | ||
self.pages = self._get_pages(pages) | ||
|
||
@contextmanager | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the meat of it: it either opens a file handle or passes the bytes through, depending on the case. |
||
def managed_file_context(self): | ||
"""Reads from either the `filepath` or `file_bytes` | ||
attribute of this instance, to return a file-like object. | ||
Closes any open file handles on exit or error. | ||
|
||
Returns | ||
------- | ||
file_bytes : io.IOBase | ||
A readable, seekable, file-like object | ||
""" | ||
if self.file_bytes: | ||
# if we can't seek, write to a BytesIO object that can, | ||
# then seek to the beginning before yielding | ||
if not hasattr(self.file_bytes, 'seek'): | ||
self.file_bytes = io.BytesIO(self.file_bytes.read()) | ||
self.file_bytes.seek(0) | ||
yield self.file_bytes | ||
else: | ||
with open(self.filepath, "rb") as file_bytes: | ||
yield file_bytes | ||
|
||
def _get_pages(self, pages): | ||
"""Converts pages string to list of ints. | ||
|
||
|
@@ -71,7 +110,7 @@ def _get_pages(self, pages): | |
if pages == "1": | ||
page_numbers.append({"start": 1, "end": 1}) | ||
else: | ||
with open(self.filepath, "rb") as f: | ||
with self.managed_file_context() as f: | ||
infile = PdfReader(f, strict=False) | ||
|
||
if infile.is_encrypted: | ||
|
@@ -107,7 +146,7 @@ def _save_page(self, filepath, page, temp): | |
Tmp directory. | ||
|
||
""" | ||
with open(filepath, "rb") as fileobj: | ||
with self.managed_file_context() as fileobj: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks like a bug. |
||
infile = PdfReader(fileobj, strict=False) | ||
if infile.is_encrypted: | ||
infile.decrypt(self.password) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
import io | ||
import re | ||
import random | ||
import shutil | ||
|
@@ -36,6 +36,10 @@ | |
_VALID_URLS.discard("") | ||
|
||
|
||
class InvalidArguments(Exception): | ||
pass | ||
|
||
|
||
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py | ||
def is_url(url): | ||
"""Check to see if a URL has a valid protocol. | ||
|
@@ -66,31 +70,30 @@ def random_string(length): | |
return ret | ||
|
||
|
||
def download_url(url): | ||
"""Download file from specified URL. | ||
def get_url_bytes(url): | ||
"""Get a stream of bytes for url | ||
|
||
Parameters | ||
---------- | ||
url : str or unicode | ||
|
||
Returns | ||
------- | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This change is only loosely related to my feature, but this is an anti-pattern in my opinion.
Trying to maintain this file outside of the context manager provided by |
||
filepath : str or unicode | ||
Temporary filepath. | ||
file_bytes : io.BytesIO | ||
a file-like object that can be read | ||
|
||
""" | ||
filename = f"{random_string(6)}.pdf" | ||
with tempfile.NamedTemporaryFile("wb", delete=False) as f: | ||
headers = {"User-Agent": "Mozilla/5.0"} | ||
request = Request(url, None, headers) | ||
obj = urlopen(request) | ||
content_type = obj.info().get_content_type() | ||
if content_type != "application/pdf": | ||
raise NotImplementedError("File format not supported") | ||
f.write(obj.read()) | ||
filepath = os.path.join(os.path.dirname(f.name), filename) | ||
shutil.move(f.name, filepath) | ||
return filepath | ||
file_bytes = io.BytesIO() | ||
file_bytes.name = url | ||
headers = {"User-Agent": "Mozilla/5.0"} | ||
request = Request(url, data=None, headers=headers) | ||
obj = urlopen(request) | ||
content_type = obj.info().get_content_type() | ||
if content_type != "application/pdf": | ||
raise NotImplementedError("File format not supported") | ||
file_bytes.write(obj.read()) | ||
file_bytes.seek(0) | ||
return file_bytes | ||
|
||
|
||
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I elected to keep the arguments separate instead of combining them the way
pandas.read_csv
(or any of the others) does, mostly to preserve the existing API kwargs, i.e. I did not want to rename this argument to
file_path_or_bytes