forked from pymupdf/PyMuPDF-Utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean-cont.py
65 lines (61 loc) · 2.45 KB
/
clean-cont.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""
Utility script
--------------
If a PDF has pages with more than one /Contents object, combine them into one.
This is either done with the 'clean' option of 'save' (which invokes a MuPDF
function), or by just concatenating the streams.
The latter option is used if no contents stream is in use by more than one
page. The MuPDF function does additional syntax checking of the streams as well
as general consistency checks.
"""
from __future__ import print_function

import os
import sys
import time

import fitz
def output_path(name, prefix="cleaned-"):
    """Return *name* with *prefix* put in front of its file name.

    Only the last path component is prefixed, so ``dir/in.pdf`` maps to
    ``dir/cleaned-in.pdf`` instead of pointing into a folder ``cleaned-dir``
    that most likely does not exist. A bare file name behaves as before.
    """
    folder, base = os.path.split(name)
    return os.path.join(folder, prefix + base)


def join_streams(streams):
    """Concatenate content streams (bytes objects) into one bytes object.

    A newline is inserted between consecutive streams: without a whitespace
    separator the last token of one stream could fuse with the first token
    of the next one (e.g. ``q`` + ``Q`` -> ``qQ``).
    """
    return b"\n".join(streams)


def has_multiple(contents):
    """Return True if any page owns more than one /Contents xref.

    *contents* is a list holding one list of xref numbers per page. Checking
    page by page (rather than comparing totals) also works when some pages
    have no /Contents at all.
    """
    return any(len(xrefs) > 1 for xrefs in contents)


def has_shared(contents):
    """Return True if some xref number occurs more than once in *contents*.

    *contents* is a list holding one list of xref numbers per page.
    """
    flat = [xref for xrefs in contents for xref in xrefs]
    return len(flat) != len(set(flat))


def main():
    """Combine multiple /Contents objects of each page of a PDF into one.

    The PDF file name is taken from the first command line argument; the
    result is saved under the same name prefixed with 'cleaned-'.

    Raises:
        SystemExit: if PyMuPDF is too old, no file name was given, or the
            document is not a PDF.
    """
    if not tuple(map(int, fitz.version[0].split("."))) >= (1, 13, 5):
        raise SystemExit("require PyMuPDF v1.13.5+")
    if len(sys.argv) < 2:
        raise SystemExit("usage: %s input.pdf" % sys.argv[0])

    t0 = time.time()
    doc = fitz.open(sys.argv[1])
    if not doc.isPDF:
        raise SystemExit("Only works for PDF.")

    print(
        "\nChecking file '%s' (%i pages) for multiple /Contents.\n"
        % (doc.name, len(doc))
    )

    # one list of contents xref numbers per page, in page order
    contents = [page._getContents() for page in doc]
    total = sum(len(xrefs) for xrefs in contents)

    if has_multiple(contents):  # some pages have more than one!
        print(
            "There exist pages with multiple /Contents (%i : %i)."
            % (total, len(doc))
        )
        if has_shared(contents):  # there are duplicate xrefs!
            print("Re-used /Contents exist -> using MuPDF 'clean'.")
            doc.save(
                output_path(doc.name),
                garbage=2,
                clean=True,  # use the standard clean function
                deflate=True,  # recompress contents objects
            )
        else:  # every contents object belongs to exactly one page
            print("All /Contents are used only once - combining multiples.")
            pcount = 0
            for page in doc:
                xrefs = contents[page.number]
                if len(xrefs) < 2:  # page has at most one contents
                    continue
                pcount += 1
                print(
                    "cleaning page %i with %i objects"
                    % (page.number, len(xrefs))
                )
                # concat all contents and ...
                combined = join_streams(
                    [doc._getXrefStream(xref) for xref in xrefs]
                )
                doc._updateStream(xrefs[0], combined)  # ... put in first one
                page._setContents(xrefs[0])  # reflect this in page defin.
            print("Content of %i pages cleaned." % pcount)
            doc.save(
                output_path(doc.name),
                garbage=2,  # remove unused & compact XREF
                deflate=True,
            )
    else:
        print("Nothing to do: all pages have only one /Contents object.")

    t1 = time.time()
    print("Elapsed time %g seconds" % (t1 - t0))


if __name__ == "__main__":
    main()