-
Notifications
You must be signed in to change notification settings - Fork 0
/
do-convert-single-file
executable file
·84 lines (72 loc) · 2.02 KB
/
do-convert-single-file
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
set -e
ensure_slimerjs_profile_template() {
  # Create the profile "template" in /tmp/slimerjs-profile-template on first
  # run. Callers copy that directory for every conversion, which is faster
  # than forcing Firefox to build a fresh profile each time.
  #
  # The profile is built in a staging directory and only renamed into place
  # once SlimerJS has exited successfully. Otherwise a failed or interrupted
  # first run would leave an empty/partial template behind, and every later
  # run would see the directory and silently reuse it.
  #
  # Returns: 0 if the template exists or was created; otherwise the exit
  #          status of the command that failed.
  local template_dir=/tmp/slimerjs-profile-template
  local staging_dir="${template_dir}.partial"
  local status=0

  if [ -d "$template_dir" ]; then
    return 0
  fi

  # Discard leftovers from a previous attempt that never completed.
  rm -rf "$staging_dir"
  mkdir "$staging_dir"

  # A script that exits immediately: we only want the side effect of the
  # profile directory being populated.
  echo 'slimer.exit()' > /tmp/noop.js
  slimerjs \
    --profile "$staging_dir" \
    --headless \
    /tmp/noop.js || status=$?
  if [ "$status" -ne 0 ]; then
    rm -rf "$staging_dir"
    return "$status"
  fi

  # Publish the finished profile under its final name.
  mv "$staging_dir" "$template_dir"
}
reset_slimerjs_profile() {
  # Replace /tmp/slimerjs-profile with a pristine copy of the profile
  # template, creating the template first if it does not exist yet.
  local pristine=/tmp/slimerjs-profile-template
  local working=/tmp/slimerjs-profile

  ensure_slimerjs_profile_template

  # Drop whatever the previous conversion left behind, then clone the
  # template (cp -a keeps modes, ownership and timestamps).
  rm -rf "$working"
  cp -a "$pristine" "$working"
}
convert_html() {
  # Render the HTML in input.blob through SlimerJS.
  # Output: 0.blob (PDF) and 0.txt, written by /app/convert.js.
  local -a slimer_args=(
    --profile /tmp/slimerjs-profile
    --headless
    /app/convert.js
  )

  # Every conversion starts from a fresh Firefox profile.
  reset_slimerjs_profile

  # NOTE(review): presumably the .html name is what convert.js (or Firefox)
  # expects to load -- confirm against /app/convert.js.
  mv input.blob input.html

  slimerjs "${slimer_args[@]}"
}
convert_rtf() {
  # Convert the RTF document in input.blob using Ted and Ghostscript.
  # Output: 0.txt and 0.ps (from Ted) and 0.blob (from Ghostscript's pdfwrite
  # device, reading 0.ps).
  local ted_target
  local -a gs_args=(
    -sDEVICE=pdfwrite
    -sOutputFile=0.blob
    -dSAFER
    -dBATCH
    -dNOPAUSE
    -dNOPROMPT
    -dQUIET
    -dCompatibilityLevel=1.4
    -q
    # NOTE(review): the all-zero UUIDs presumably keep the PDF output
    # reproducible between runs -- confirm.
    -sDocumentUUID=00000000-0000-0000-0000-000000000000
    -sInstanceUUID=00000000-0000-0000-0000-000000000000
    -c "100000000 setvmthreshold"
    -f0.ps
  )

  # --saveTo uses document paper size; --printToFile uses letter paper size.
  # We want --saveTo, for the text output and the PostScript output alike.
  for ted_target in 0.txt 0.ps; do
    Ted --saveTo input.blob "$ted_target"
  done

  gs "${gs_args[@]}"
}
# Dispatch on the contentType field of the JSON document passed as $1:
# application/rtf goes through convert_rtf, anything else is treated as HTML
# and goes through convert_html. The thumbnail step below reads the
# converter's output from 0.blob.
#
# printf is used instead of echo so that "$1" is always forwarded to jq
# verbatim; echo would interpret a value such as "-n" as an option, feed jq
# empty input, and let the script carry on as if nothing were wrong.
CONTENT_TYPE=$(printf '%s\n' "$1" | jq -r .contentType)
case "$CONTENT_TYPE" in
  application/rtf)
    convert_rtf
    ;;
  *)
    convert_html
    ;;
esac

# SlimerJS renders screenshots, but they're of the _page_, not the _PDF_. We'll
# generate a screenshot of the PDF.
#
# Ditto for RTF conversion
pdftocairo \
  -jpeg \
  -singlefile \
  -scale-to 700 \
  0.blob \
  0-thumbnail

# Output JSON: copy filename, languageCode, wantSplitByPage and metadata from
# the input, force wantOcr to false and set contentType to application/pdf.
printf '%s\n' "$1" | jq '{ filename: .filename, languageCode: .languageCode, wantOcr: false, wantSplitByPage: .wantSplitByPage, contentType: "application/pdf", metadata: .metadata }' > 0.json