forked from pymupdf/PyMuPDF-Utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanonymize.py
50 lines (47 loc) · 1.8 KB
/
anonymize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from __future__ import print_function
import fitz, sys
"""
PyMuPDF utility
----------------
Scan through all pages of a PDF and remove all text. The metadata dictionary
will also be cleared with "none" values. Any XML-based metadata will also be
deleted. The resulting PDF will be save under the name "anonymous.pdf" in the
same directory.
"""
def remove_txt(cont):
"""Remove everything enclosed in a pair of "BT" / "ET" strings, including both. Assuming "cont" is the string of a PDF "/Contents" stream, this will make all text of the owning page disappear (permanent delete). "cont" is a bytes object in Python 3, so we will turn it into a string first in that case.
"""
if type(cont) is bytes and str is not bytes:
cont1 = (cont.decode()).replace("\n", " ")
else:
cont1 = cont.replace("\n", " ")
ct = cont1.split(" ")
nct = []
intext = False
for word in ct:
if word == "ET":
intext = False
continue
if word == "BT":
intext = True
continue
if intext:
continue
nct.append(word)
ncont = " ".join(nct)
if str is not bytes: # caller expects bytes if Python 3
ncont = bytes(ncont, "utf-8")
return ncont
assert len(sys.argv) == 2, "need input PDF file name"
fn = sys.argv[1]
assert fn.endswith(".pdf"), "expect a PDF file"
doc = fitz.open(fn)
doc.setMetadata({}) # set metadata values to "none"
doc._delXmlMetadata() # delete any XML metadata
for page in doc:
xref_lst = page._getContents()
for xref in xref_lst:
cont = doc._getXrefStream(xref)
ncont = remove_txt(cont)
doc._updateStream(xref, ncont)
doc.save("anonymous.pdf", clean = True, garbage = 4)