forked from jsvine/pdfplumber
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_convert.py
96 lines (79 loc) · 2.66 KB
/
test_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import unittest
import pdfplumber
from subprocess import Popen, PIPE
from io import StringIO
import json
import sys
import os
import logging
# Suppress every log record at ERROR severity or below for the whole module,
# so library logging does not clutter the test output.
logging.disable(logging.ERROR)

# Absolute directory containing this test module; fixture paths are built
# by joining onto it.
HERE = os.path.dirname(os.path.abspath(__file__))
def run(cmd):
    """Run *cmd* as a child process and return what it wrote to stdout.

    ``cmd`` is passed unchanged to ``Popen``. Only stdout is piped, so the
    result is the raw ``bytes`` captured from that stream. The child's exit
    status is not inspected.
    """
    child = Popen(cmd, stdout=PIPE)
    # communicate() waits for the child to exit and returns (stdout, stderr);
    # stderr is not piped here, so the second element is always None.
    captured, _ = child.communicate()
    return captured
class Test(unittest.TestCase):
    """Tests for pdfplumber's JSON/CSV export methods and its CLI module.

    Pages 1, 2 and 5 of ``pdfs/pdffill-demo.pdf`` are opened once in
    ``setup_class`` and shared, as class attributes, by every test.
    """

    @classmethod
    def setup_class(cls):
        """Open the shared fixture PDF and store it on the class."""
        cls.path = os.path.join(HERE, "pdfs/pdffill-demo.pdf")
        cls.pdf = pdfplumber.open(cls.path, pages=[1, 2, 5])

    @classmethod
    def teardown_class(cls):
        """Close the shared fixture PDF."""
        cls.pdf.close()

    def test_json(self):
        """PDF-level JSON keeps the first rect's ``bottom`` value."""
        c = json.loads(self.pdf.to_json())
        assert c["pages"][0]["rects"][0]["bottom"] == float(
            self.pdf.pages[0].rects[0]["bottom"]
        )

    def test_json_all_types(self):
        """With ``types=None`` the first page's JSON has every object key."""
        c = json.loads(self.pdf.to_json(types=None))
        found_types = c["pages"][0].keys()
        for expected in ("curves", "chars", "lines", "rects", "images"):
            assert expected in found_types, expected

    def test_single_pages(self):
        """Page-level JSON keeps the first rect's ``bottom`` value."""
        c = json.loads(self.pdf.pages[0].to_json())
        assert c["rects"][0]["bottom"] == float(
            self.pdf.pages[0].rects[0]["bottom"]
        )

    def test_additional_attr_types(self):
        """The issue-67 example PDF serializes with at least one image."""
        path = os.path.join(HERE, "pdfs/issue-67-example.pdf")
        with pdfplumber.open(path, pages=[1]) as pdf:
            c = json.loads(pdf.to_json())
            assert len(c["pages"][0]["images"])

    def test_csv(self):
        """CSV output matches a known row and is the same via a stream."""
        c = self.pdf.to_csv()
        # Index 2 of the CRLF-split output is compared against a known
        # "char" row from page 1.
        assert c.split("\r\n")[2] == (
            "char,1,45.83,58.826,656.82,674.82,117.18,117.18,135.18,12.996,"
            '18.0,12.996,,,,,,TimesNewRomanPSMT,,,,"(0, 0, 0)",,,18.0,,,,,Y,,1,'
        )

        # Writing into a caller-supplied text stream must produce exactly
        # the same text as the string returned above.
        buffer = StringIO()
        self.pdf.to_csv(buffer)
        buffer.seek(0)
        c_from_io = buffer.read()
        assert c == c_from_io

    def test_csv_all_types(self):
        """With ``types=None`` the row at index 1 is a ``curve`` row."""
        c = self.pdf.to_csv(types=None)
        assert c.split("\r\n")[1].split(",")[0] == "curve"

    def test_cli(self):
        """``python -m pdfplumber.cli`` emits JSON for the requested pages."""
        res = run(
            [
                sys.executable,
                "-m",
                "pdfplumber.cli",
                self.path,
                "--format",
                "json",
                "--pages",
                "1-2",
                "5",
                "--indent",
                "2",
            ]
        )

        c = json.loads(res)
        # "1-2" and "5" select pages 1, 2 and 5, in that order.
        assert c["pages"][0]["page_number"] == 1
        assert c["pages"][1]["page_number"] == 2
        assert c["pages"][2]["page_number"] == 5
        # The CLI output must agree with the in-process API on the same file.
        assert c["pages"][0]["rects"][0]["bottom"] == float(
            self.pdf.pages[0].rects[0]["bottom"]
        )