forked from nimaid/bookdir2pdf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bookdir2pdf.py
966 lines (799 loc) · 37 KB
/
bookdir2pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
#!/usr/bin/env python3
import argparse, os, sys
from pathlib import Path
import re
import atexit
# Work out whether we run as a frozen (PyInstaller) executable or a plain .py file.
IS_EXE = bool(getattr(sys, 'frozen', False))
# The program file is the executable itself when frozen, otherwise this script.
PROG_FILE = sys.executable if IS_EXE else os.path.realpath(__file__)
PROG_PATH = os.path.dirname(PROG_FILE)
# When frozen, PATH comes from sys._MEIPASS; otherwise it is the script directory.
PATH = sys._MEIPASS if IS_EXE else PROG_PATH
# Program name without directory or extension
PROG_FILE_NAME = os.path.splitext(os.path.basename(PROG_FILE))[0]
# Directory the command was invoked from
COMMAND_PATH = Path.cwd()
# Setup exit handler (used for cleanup)
# job_complete is rebound to a function at the very end of a successful run.
job_complete = None
# Cleanup callbacks, executed in registration order when the interpreter exits.
exit_funcs = list()


def exit_handler():
    """Run every registered cleanup callback, then the completion report.

    Each callback in ``exit_funcs`` is called in order.  A callback that
    raises is reported and skipped so that the remaining cleanups (and the
    completion report) still run instead of being abandoned half-way.
    """
    if exit_funcs:
        print()
        print("-------- CLEANUP --------")
        for cleanup in exit_funcs:
            try:
                cleanup()
            except Exception as err:
                # Best effort: one failed cleanup must not block the others.
                print("[WARNING]: Cleanup step failed: {}".format(err))
    if job_complete is not None:  # defined at the very end only
        job_complete()


atexit.register(exit_handler)
# Parse arguments before running main program
def dir_path(string):
    """argparse ``type=`` callback that accepts only existing directories.

    Returns *string* unchanged when it names a directory.

    Raises:
        argparse.ArgumentTypeError: if *string* is not an existing directory.
            argparse turns this exception type into a normal usage error,
            whereas other exception types escape as a traceback.
    """
    if not os.path.isdir(string):
        raise argparse.ArgumentTypeError("'{}' is not an existing directory.".format(string))
    return string
# Command-line interface definition.
# Every option except --input_dir is optional and defaults to None (or False for
# the --no_pdf flag); the code further down treats None as "not given".
#TODO: Add usage examples
ap = argparse.ArgumentParser(description="Merge nested image directory into PDF with nested bookmarks.")
# The input directory is validated by dir_path while parsing.
ap.add_argument("-i", "--input_dir", type=dir_path, required=True,
help="path to nested image directory to merge")
ap.add_argument("-o", "--output_file", type=str, default=None,
help="output file path ( defaults to [input_dir].pdf )")
ap.add_argument("-s", "--order_number_separator", type=str, default=None,
help="the character used to separate the directory ordering numbers from the bookmark names ( like '.' or ')' )")
ap.add_argument("-n", "--no_pdf", action="store_true",
help="just scan directory and print table of contents")
# nargs="*": the flag alone yields an empty list, name=value sub-arguments are
# collected as raw strings and parsed by hand later in the script.
ap.add_argument("-p", "--purify", action="store", default=None, nargs="*", type=str,
help="purify scanned B&W page ( greyscale, sharpen, threshold ), named sub-arguments: (sharpen|s) (threshold|t)")
ap.add_argument("-d", "--dpi", type=int, default=None,
help="dots-per-inch of the input images")
ap.add_argument("-t", "--title", type=str, default=None,
help="the PDF title ( defaults to the directory basename )")
ap.add_argument("-a", "--author", type=str, default=None,
help="the PDF author")
# Same raw name=value collection scheme as --purify.
ap.add_argument("-f", "--table_of_contents_format", action="store", default=None, nargs="*", type=str,
help="formatting options for the table of contents, named sub-arguments: (break_limit|b) (number_prefix|p) (number_postfix|a) (indent|i)")
# Convert the Namespace into a plain dict so options are read as args["name"].
args = vars(ap.parse_args())
print()
# Resolve input dir into absolute path (relative to working directory!)
# os.path.realpath() always returns an absolute, normalised path, so no manual
# joining against the working directory is required afterwards.
input_dir = os.path.realpath(args["input_dir"])
# Name of the input directory itself (last path component)
input_dir_name = os.path.basename(input_dir)
# Get main directory (the parent of the input directory).
# os.path.dirname keeps the root/drive separator, so a directory that sits
# directly under the filesystem root still gets an absolute parent instead of
# an empty (or drive-relative) string.
main_dir = os.path.dirname(input_dir)
# Decide whether purification is requested and collect its raw sub-arguments.
# Purification is switched off whenever --no_pdf is set.
purify_args = args["purify"] if args["purify"] is not None else ()
purify = args["purify"] is not None and not args["no_pdf"]

# Parse the purify sub-arguments (name=value pairs)
if purify:
    # Defaults
    sharpen_factor = 2
    thresh_setting = 170
    for raw_option in purify_args:
        pieces = raw_option.split("=")
        # Exactly one "=" is required: [string]=[string]
        if len(pieces) != 2:
            raise argparse.ArgumentTypeError("Invalid argument format. Use arg_name=arg_value.")
        option_name, option_value = (piece.lower().strip() for piece in pieces)
        if option_name in ("sharpen", "s"):
            # Must parse as a float and must not be <= 0
            try:
                sharpen_factor = float(option_value)
                option_ok = not (sharpen_factor <= 0)
            except ValueError:
                option_ok = False
            if not option_ok:
                raise argparse.ArgumentTypeError("(--purify | -p) sharpness must be a float greater than 0.")
        elif option_name in ("threshold", "t"):
            # Must parse as a float and must not fall outside 0..255
            try:
                thresh_setting = float(option_value)
                option_ok = not (thresh_setting < 0 or thresh_setting > 255)
            except ValueError:
                option_ok = False
            if not option_ok:
                raise argparse.ArgumentTypeError("(--purify | -p) threshold must be a positive float <= 255.")
        else:
            raise argparse.ArgumentTypeError("'{}' is not a valid option for (--purify | -p).".format(option_name))
# Function to read the first line of a text file
def read_string_from_file(string_file_name):
    """Return the first line of a UTF-8 text file, stripped of whitespace.

    The whole content is stripped first, so leading blank lines are skipped.
    An empty (or whitespace-only) file yields an empty string.
    """
    with open(string_file_name, 'r', encoding='utf-8') as handle:
        content = handle.read()
    first_line, _, _ = content.strip().partition("\n")
    return first_line.strip()
# Function to make any string into a valid filename
def get_valid_filename(s):
    """Return ``str(s)`` stripped, with each of / \\ : * ? " < > | replaced by "_"."""
    forbidden_chars = '/\\:*?"<>|'
    replacement_table = str.maketrans(forbidden_chars, "_" * len(forbidden_chars))
    return str(s).strip().translate(replacement_table)
# Function to turn a path into the name after the os.path.extsep (even if no name)
def path_to_ext(path_in):
    """Return the lower-cased extension of the last component of *path_in*.

    Files whose whole name is an extension (e.g. ``.name``) are treated as
    having that extension.

    Returns:
        None  -- the path has no final component (it ends with a separator)
        ""    -- the final component has no extension at all
        str   -- the extension including the leading dot, always lower-cased
    """
    filename = os.path.basename(path_in)
    stem, ext = os.path.splitext(filename)
    if ext == "":
        # No name, just extension (dot-file): splitext reports it as the stem
        ext = stem
    if not ext:
        # Must be a directory
        return None
    if not ext.startswith("."):
        # No actual extension
        return ""
    # Lower-case on every path, including the dot-file fallback, so that
    # ".NAME" compares equal to ".name" in the extension lists.
    return ext.lower()
# Extensions of files that are ignored during the scan
ignored_file_exts = [".ignore", ".db"]
# Metadata files: bookmark/title rename, author and DPI
rename_exts = [".name", ".title"]
author_exts = [".author"]
dpi_exts = [".dpi"]
metadata_file_exts = rename_exts + author_exts + dpi_exts
# Blank page marker files
blank_exts = [".blank"]
# Everything that counts as a page: image files plus blank page markers
page_exts = [".jpg", ".jpeg", ".png", ".gif"] + blank_exts
# All recognised extensions (blank_exts is listed twice: once via page_exts)
valid_exts = ignored_file_exts + page_exts + metadata_file_exts + blank_exts
# Get files in main input directory
input_dir_files = [str(p) for p in Path(input_dir).glob("*") if os.path.isfile(p)]
# Set/limit DPI.
# Priority: --dpi argument, then a single DPI file in the main directory, then 300.
dpi_files = [p for p in input_dir_files if path_to_ext(p) in dpi_exts]
if len(dpi_files) > 1:
    raise argparse.ArgumentTypeError("Multiple DPI files found in the main directory! Please use at most 1.")
if args["dpi"] is not None:
    pdf_dpi = args["dpi"]
    if dpi_files:
        print("[WARNING]: A DPI file exists in the main directory, but the --dpi argument overrides this.")
elif dpi_files:
    try:
        pdf_dpi = int(read_string_from_file(dpi_files[0]))
    except ValueError:
        # The exception must actually be raised; otherwise pdf_dpi stays
        # undefined and the range check below fails with a NameError.
        raise argparse.ArgumentTypeError("DPI setting from DPI file must be a valid integer.")
else:
    pdf_dpi = 300
if (pdf_dpi < 72) or (pdf_dpi > 4800):
    raise argparse.ArgumentTypeError("DPI must be 72 <= DPI <= 4800. Current setting: '{}'".format(pdf_dpi))
# Set PDF title.
# Priority: --title argument, then a single title/name file, then the directory
# name (in which case use_pdf_title is False).
title_files = [f for f in input_dir_files if path_to_ext(f) in rename_exts]
if len(title_files) > 1:
    raise argparse.ArgumentTypeError("Multiple title/name files found in the main directory! Please use at most 1.")
if args["title"] is not None:
    if title_files:
        print("[WARNING]: A title/name file exists in the main directory, but the --title argument overrides this.")
    pdf_title = args["title"]
    use_pdf_title = True
elif title_files:
    pdf_title = read_string_from_file(title_files[0])
    use_pdf_title = True
else:
    pdf_title = input_dir_name
    use_pdf_title = False
# Set PDF author.
# Priority: --author argument, then a single author file, then empty.
author_files = [f for f in input_dir_files if path_to_ext(f) in author_exts]
if len(author_files) > 1:
    raise argparse.ArgumentTypeError("Multiple author files found in the main directory! Please use at most 1.")
if args["author"] is not None:
    if author_files:
        print("[WARNING]: An author file exists in the main directory, but the --author argument overrides this.")
    pdf_author = args["author"].strip()
elif author_files:
    pdf_author = read_string_from_file(author_files[0])
else:
    pdf_author = ""
# Resolve output filename (only needed when a PDF is actually written)
if not args["no_pdf"]:
    requested_output = args["output_file"]
    pdf_suffix = os.path.extsep + "pdf"
    if requested_output is None:
        # Default: sanitised title, placed next to the input directory
        output_file = os.path.join(main_dir, get_valid_filename(pdf_title)) + pdf_suffix
    else:
        output_file = requested_output
        # Append ".pdf" unless the file name already ends with it (any case)
        if not os.path.basename(requested_output).lower().endswith(pdf_suffix):
            output_file += pdf_suffix
    output_file = os.path.realpath(output_file)
    output_file_dir, output_file_name = os.path.split(output_file)
# Test if special ToC formatting is used
tocf_args = args["table_of_contents_format"]
if tocf_args is None:
    tocf_args = ()
# Defaults
toc_line_break_limit = None
pagenum_pre = "Page #"
pagenum_post = " "
ident_str = "--- "
# Smallest accepted (exclusive) value for the name break limit
min_toc_line_break_limit = 10
# Parse ToC formatting sub arguments
for tocf_arg in tocf_args:
    tocf_arg_split = tocf_arg.split("=")
    # Only allow [string]=[string]
    if len(tocf_arg_split) != 2:
        raise argparse.ArgumentTypeError("Invalid argument format. Use arg_name=arg_value.")
    # Only the option NAME is case-insensitive.  The value is kept verbatim so
    # that user-supplied prefixes/postfixes/indents keep their capitalisation.
    tocf_arg_name = tocf_arg_split[0].lower()
    tocf_arg_value = tocf_arg_split[1]
    # Parse ToC formatting named sub-arguments
    if tocf_arg_name in ("break_limit", "b"):
        # Must be an integer: 0 means "no limit", otherwise it has to be
        # greater than min_toc_line_break_limit.
        try:
            requested_limit = int(tocf_arg_value.strip())
        except ValueError:
            # Not an integer: reject below instead of comparing None to an int
            requested_limit = None
        if requested_limit == 0:
            toc_line_break_limit = None
        elif requested_limit is None or requested_limit <= min_toc_line_break_limit:
            raise argparse.ArgumentTypeError("(--break_limit | -b) length must be an integer greater than {}, or 0 for no limit.".format(min_toc_line_break_limit))
        else:
            toc_line_break_limit = requested_limit
    elif tocf_arg_name in ("number_prefix", "p"):
        pagenum_pre = tocf_arg_value
    elif tocf_arg_name in ("number_postfix", "a"):
        pagenum_post = tocf_arg_value
    elif tocf_arg_name in ("indent", "i"):
        ident_str = tocf_arg_value
    else:
        raise argparse.ArgumentTypeError("'{}' is not a valid option for (--table_of_contents_format | -f).".format(tocf_arg_name))
# Parse purify sub-arguments (values)
# NOTE(review): this pass is a line-for-line repeat of the purify parsing done
# earlier in the script (right after the purify flag is evaluated).  It resets
# the defaults and re-parses the same purify_args, so it yields the same
# sharpen_factor / thresh_setting again -- looks redundant; confirm and remove.
if purify:
# Defaults
sharpen_factor = 2
thresh_setting = 170
for p_arg in purify_args:
p_arg_split = p_arg.split("=")
# Only allow [string]=[string]
if len(p_arg_split) != 2:
raise argparse.ArgumentTypeError("Invalid argument format. Use arg_name=arg_value.")
# Get name and value separately
# (both are lower-cased and stripped of surrounding whitespace)
p_arg_name, p_arg_value = [x.lower().strip() for x in p_arg_split]
# Parse purify named sub-arguments
if p_arg_name in ["sharpen", "s"]:
# Test if it's a float and set
worked = True
try:
sharpen_factor = float(p_arg_value)
except(ValueError):
worked = False
# Test if it's greater than 0
# (on a failed conversion sharpen_factor still holds its previous value,
# but worked is already False so the error below is raised anyway)
if sharpen_factor <= 0:
worked = False
if not worked:
raise argparse.ArgumentTypeError("(--purify | -p) sharpness must be a float greater than 0.")
elif p_arg_name in ["threshold", "t"]:
# Test if it's a float and set
worked = True
try:
thresh_setting = float(p_arg_value)
except(ValueError):
worked = False
# Test if it's not negative (0 itself is accepted)
if thresh_setting < 0:
worked = False
# Test if it's <= 255
if thresh_setting > 255:
worked = False
if not worked:
raise argparse.ArgumentTypeError("(--purify | -p) threshold must be a positive float <= 255.")
else:
# Unknown sub-argument name
raise argparse.ArgumentTypeError("'{}' is not a valid option for (--purify | -p).".format(p_arg_name))
# Warn when --purify was requested but is pointless because no PDF is made
if args["no_pdf"] and args["purify"] is not None:
    print("[WARNING]: Both (--purify|-p) and (--no_pdf|-n) arguments were passed, will not make PDF.")
# Summarise the effective settings
print()
print("-------- PDF SETTINGS --------")
pdf_no_title_string = "PDF will have no title."
if use_pdf_title and len(pdf_title) > 0:
    print("PDF title: {}".format(pdf_title))
else:
    print(pdf_no_title_string)
if pdf_author:
    print("PDF author: {}".format(pdf_author))
else:
    print("PDF will have no author.")
if not args["no_pdf"]:
    print("PDF resolution: {} DPI".format(pdf_dpi))
print("Input directory: {}".format(input_dir))
if args["no_pdf"]:
    print("Will only print the Table of Contents, will NOT process images or save PDF.")
else:
    # Target filename
    print("Output filename: {}".format(output_file))
# purify is always False when --no_pdf is set, so this only prints for PDF runs
if purify:
    print("Will purify documents:")
    print("\tSharpening factor: {}".format(sharpen_factor))
    print("\tThreshold: {}.".format(thresh_setting))
if args["table_of_contents_format"] is not None:
    print("Table of Contents formatting:")
    print("\tName length break limit: {}".format(toc_line_break_limit))
    print("\tPage number prefix: '{}'".format(pagenum_pre))
    print("\tPage number postfix: '{}'".format(pagenum_post))
    print("\tIndent text: '{}'".format(ident_str))
# We will be catching KeyboardInterrupts
try:
# Do main imports
import img2pdf
from PIL import Image, ImageEnhance
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader
import shutil
import textwrap
print()
print("-------- DIRECTORY SCANNING --------")
print("Scanning directory: '{}'...".format(input_dir))
# Recursive, alphabetically sorted listing of every file and folder
input_dir_list = list(map(str, sorted(Path(input_dir).glob('**/*'))))
# Temporary files/folders are named with this prefix and live next to the input dir
temp_name_prepend = "__temp__"
temp_dir_name = temp_name_prepend + input_dir_name
temp_dir = os.path.join(main_dir, temp_dir_name)
if not args["no_pdf"]:
    def clean_temp_dir():
        """Remove the temporary directory (or a plain file of the same name)."""
        if os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)
        elif os.path.isfile(temp_dir):
            os.remove(temp_dir)

    # Start from a fresh, empty temp directory
    clean_temp_dir()
    os.mkdir(temp_dir)

    def exit_clean_temp_dir():
        """Exit-time cleanup: report and delete the temporary directory."""
        print("Delete temporary directory: {}".format(temp_dir))
        clean_temp_dir()
        print("\tDone!")

    # Make sure the temp directory is removed even on an early exit
    exit_funcs.append(exit_clean_temp_dir)
# Pages are read from the temp directory when purifying, else from the original
final_input_dir = temp_dir if purify else input_dir
# Save image paths (and empty/ignored-file dirs) paths to ordered list
page_list = list()
# Make dict with rename (dir, bm_name)
page_dir_rename_dict = dict()
# Limits for the "dangerously long pathname" warning.  They are constant, so
# they are set up once rather than on every loop iteration.
windows_path_limit = 260
unix_path_limit = 4096
wiggle_room = 40
min_path_length = min(windows_path_limit, unix_path_limit)
for p in input_dir_list:
    if os.path.isfile(p):
        # Check if it's an invalid extension, and if so, fully ignore it
        p_ext = path_to_ext(p)
        if p_ext not in valid_exts:
            print("[UNSUPPORTED]: {}".format(p))
            continue
        # Test if it's a metadata file, and if so, fully ignore
        if p_ext in metadata_file_exts:
            continue
        # Test if it should be ignored, and if so, fully ignore it
        if p_ext in ignored_file_exts:
            print("[IGNORING]: {}".format(p))
            continue
        # Test if the path length is nearing the smallest platform limit
        p_path_length = len(p)
        if p_path_length > min_path_length - wiggle_room:
            print("[WARNING] Dangerously long pathname: {}".format(p))
            print("\tPath length: {} characters".format(p_path_length))
            print("\tWindows maximum path length: {} characters".format(windows_path_limit))
            print("\tUnix maximum path length: {} characters".format(unix_path_limit))
            print("\tRenaming, moving, or downloading this folder may cause errors unless you shorten the names of the file/folders.")
            # "\t" (tab) like the lines above; the original "\R" was an invalid escape
            print("\tRecommended action: Use '.name' files (instead of the folder names) to define bookmark names.")
        page_list.append(p)
    elif os.path.isdir(p):
        # Get files and directories directly inside p
        p_list = [os.path.realpath(str(x)) for x in sorted(Path(p).glob('*'))]
        p_dir_list = [x for x in p_list if os.path.isdir(x)]
        p_file_list = [x for x in p_list if os.path.isfile(x)]
        # Files of p that are real pages (not ignored, not metadata)
        p_file_list_ignored = list()
        # Also update page_dir_rename_dict
        for x in p_file_list:
            x_path, x_filename = os.path.split(x)
            x_ext = path_to_ext(x)
            # Check if it's a valid extension
            if x_ext in valid_exts:
                # If not ignored file, append to file list
                if x_ext not in ignored_file_exts + metadata_file_exts:
                    p_file_list_ignored.append(x)
                # If it's a rename file, parse and append to rename dict
                if x_ext in rename_exts:
                    # If purify, make sure it's referencing the final input dir
                    if purify:
                        relative_x_path = os.path.relpath(x_path, input_dir)
                        rename_dir = os.path.join(final_input_dir, relative_x_path)
                    else:
                        rename_dir = x_path
                    rename_name = read_string_from_file(x)
                    page_dir_rename_dict[rename_dir] = rename_name
        # Test if it's empty or contains only ignored files
        if len(p_dir_list) <= 0 and len(p_file_list_ignored) <= 0:
            # Add path (used to make "empty" bookmarks)
            page_list.append(p)
print("\tDone scanning directory!")
# Get number of pages
# NOTE(review): blank pages are detected with os.path.splitext here, while the
# scan above uses path_to_ext; a file named exactly ".blank" would therefore be
# counted as an image page -- confirm whether that case matters.
num_pages = len([p for p in page_list if os.path.isfile(p)])
num_image_pages = len([p for p in page_list if os.path.isfile(p) and os.path.splitext(p)[-1] not in blank_exts])
num_pages_len = len(str(num_pages))
num_image_pages_len = len(str(num_image_pages))
print("\tPage count: {}".format(num_pages))
# Run purification (save to temporary directory)
if purify and num_image_pages > 0:
    print()
    print("-------- PURIFICATION --------")
    print("Saving purified images to temporary directory: {}".format(final_input_dir))
    # Delete temp dir (or file with same name) if it already exists
    clean_temp_dir()
    # Mirror every page_list entry into the final input dir
    new_page_list = list()
    new_page_list_dirs = list()
    new_page_list_files = list()
    for p in page_list:
        # Make final_p (replace input_dir with final_input_dir)
        rel_p = os.path.relpath(p, input_dir)
        final_p = os.path.join(final_input_dir, rel_p)
        new_page_list.append(final_p)
        if os.path.isdir(p):
            new_page_list_dirs.append(final_p)
        else:
            new_page_list_files.append(final_p)
    # Make all directories first (placeholder dirs, then parents of every file)
    for p in new_page_list_dirs:
        Path(p).mkdir(parents=True, exist_ok=True)
    for p in new_page_list_files:
        Path(os.path.dirname(p)).mkdir(parents=True, exist_ok=True)
    # Process image files
    curr_page = 0  # Will go to 1 before first print
    for p, final_p in zip(page_list, new_page_list):
        if not os.path.isfile(p):
            # Directories were already created above
            continue
        if os.path.splitext(p)[-1] in blank_exts:
            # It's a blank page file, just copy
            shutil.copy(p, final_p)
            continue
        # It's an image file, process.  (purify is always True inside this
        # branch, so the former "save unchanged" alternative was unreachable.)
        curr_page += 1
        with Image.open(p) as page_im:
            curr_page_str = str(curr_page).rjust(num_image_pages_len)
            print("[PURIFY] ({}/{}): {}".format(curr_page_str, num_image_pages, p))
            # Make greyscale
            gray = page_im.convert('L')
            # Sharpen
            enhancer = ImageEnhance.Sharpness(gray)
            sharpen = enhancer.enhance(sharpen_factor)
            # Apply threshold: values above thresh_setting become 255, the rest 0
            thresh = sharpen.point(lambda level: level > thresh_setting and 255)
            # Make 1 bit
            final_page_im = thresh.convert('1')
            # Save image (PNG data, written to the mirrored path)
            final_page_im.save(final_p, "PNG", dpi=(pdf_dpi, pdf_dpi))
    # Update page_list with new images/paths
    page_list = new_page_list
    print("\tDone purifying images!")
# Page size: taken from the first image page, or 8.5in x 11in at pdf_dpi
# when there are no image pages at all
pl_files_images = [
    p for p in page_list
    if os.path.isfile(p) and os.path.splitext(p)[-1] not in blank_exts
]
if num_image_pages > 0:
    with Image.open(pl_files_images[0]) as cover:
        width, height = cover.size
else:
    width, height = int(pdf_dpi * 8.5), int(pdf_dpi * 11)
# Blank pages are all pages that are not image pages
num_blank_pages = num_pages - num_image_pages
blanks_used = num_blank_pages > 0
if blanks_used and not args["no_pdf"]:
    print()
    print("-------- BLANK PAGES --------")
    print("Saving blank page images to temporary directory: {}".format(temp_dir))
    # Make pure white images for .blank files
    #TODO: Figure out how to make truly blank pages (or at least smaller file sizes)
    new_page_list = list()
    blank_page_num = 0  # Goes to 1 on first page
    blank_count_width = len(str(num_blank_pages))
    for p in page_list:
        page_basename = os.path.relpath(p, input_dir)
        page_temp = os.path.realpath(os.path.join(temp_dir, page_basename))
        if os.path.isfile(p):
            if os.path.splitext(p)[-1] in blank_exts:
                blank_page_num += 1
                # Make a white PNG in the temporary folder
                blank_page_name = os.path.splitext(page_basename)[0] + ".png"
                blank_page_filename = os.path.realpath(os.path.join(temp_dir, blank_page_name))
                curr_page_str = str(blank_page_num).rjust(blank_count_width)
                print("[BLANK] ({}/{}): {}".format(curr_page_str, num_blank_pages, blank_page_filename))
                blank_page = Image.new('1', (width, height), color="white")
                Path(os.path.dirname(blank_page_filename)).mkdir(parents=True, exist_ok=True)
                blank_page.save(blank_page_filename, "PNG", dpi=(pdf_dpi, pdf_dpi))
                new_page_list.append(blank_page_filename)
            else:
                # Copy image to temp so that every page lives under one root
                # (already there when the page was purified)
                if p != page_temp:
                    Path(os.path.dirname(page_temp)).mkdir(parents=True, exist_ok=True)
                    shutil.copy(p, page_temp)
                new_page_list.append(page_temp)
        else:
            # Placeholder directory: recreate it in temp
            Path(page_temp).mkdir(parents=True, exist_ok=True)
            new_page_list.append(page_temp)
    page_list = new_page_list
    # Every page now lives under temp_dir.  Without purification the final
    # input dir and the rename dictionary still pointed at the original
    # directory, which made the relative paths start with "../__temp__..."
    # (extra bookmark levels) and made the rename lookups miss.  Re-root both.
    if final_input_dir != temp_dir:
        page_dir_rename_dict = {
            os.path.normpath(os.path.join(temp_dir, os.path.relpath(rename_dir, input_dir))): rename_name
            for rename_dir, rename_name in page_dir_rename_dict.items()
        }
        final_input_dir = temp_dir
    print("\tDone creating blank pages!")
# Only the file entries of page_list (these become the PDF pages, in order)
page_list_files = [entry for entry in page_list if os.path.isfile(entry)]
# Build a nested ordered dictionary that mirrors the directory tree:
# every path component (relative to final_input_dir) becomes one nesting level
page_dict = OrderedDict()
for p in page_list:
    p = os.path.relpath(p, final_input_dir)  # Make relative
    current_level = page_dict
    for part in p.split(os.path.sep):
        current_level = current_level.setdefault(part, OrderedDict())
# Create PDF from page_list (no bookmarks)
if not args["no_pdf"]:
    print()
    print("-------- PDF CREATION --------")
    temp_pdf = os.path.join(output_file_dir, temp_name_prepend + output_file_name)
    print("Creating PDF document from image files...")
    temp_pdf_file_binary = img2pdf.convert(page_list_files, dpi=pdf_dpi, x=None, y=None)
    print("\tDone!")
    # Setup exit handler to delete temp PDF
    input_pdf_file = None

    def exit_clean_temp_pdf():
        """Exit-time cleanup: close the temp PDF handle (if open) and delete the file."""
        print("Delete temporary PDF: {}".format(temp_pdf))
        if input_pdf_file is not None:
            input_pdf_file.close()
        # The handler is registered before the file is written, so the file
        # may legitimately be missing if the run was interrupted early.
        if os.path.exists(temp_pdf):
            os.remove(temp_pdf)
        print("\tDone!")

    exit_funcs.append(exit_clean_temp_pdf)
    print("Saving temporary PDF: {}".format(temp_pdf))
    with open(temp_pdf, "wb") as f:
        f.write(temp_pdf_file_binary)
    print("\tDone!")
    # Load PDF into PyPDF2
    print("Loading temporary PDF into editing library...")
    output_pdf = PdfFileWriter()
    # Kept open on purpose: the reader needs it until the final PDF is
    # written; exit_clean_temp_pdf closes it.
    input_pdf_file = open(temp_pdf, 'rb')
    input_pdf = PdfFileReader(input_pdf_file)
    output_pdf.appendPagesFromReader(input_pdf)
    print("\tDone!")
print()
print("-------- BOOKMARKS --------")
print("Creating bookmark hierarchy from directory structure...")
# Save ToC lines to list
# Each entry is a dict with the keys "name", "level" and "page" (page is 1-based).
#TODO: Save ToC to file option
toc_dict_list = list()
# Bookmarks are only built when at least one top-level entry of page_dict has
# children (i.e. is a non-empty OrderedDict).
if len([p for p in page_dict.values() if p != OrderedDict()]) <= 0:
print()
if not args["no_pdf"]:
print("[WARNING]: No subdirectories found, not creating bookmarks.")
else:
print("[ERROR]: No subdirectories found, no table of contents to generate.")
else:
# Add nested bookmarks from page_dict
# Traversal state shared (via "global") with iterdict below:
#   ident_level     - current nesting depth, stored as "level" in the ToC
#   last_page_index - index in page_list_files of the last file visited
#   path_list       - directory names from the root down to the current level
#   bookmark_list   - stack of parent bookmarks (only filled when a PDF is made)
#   page_ref        - 0-based page index the current bookmark points to
ident_level = 0
last_page_index = -1 # Because we want the next page to be 0
path_list = list()
bookmark_list = list()
page_ref = None
# Recursively walk the nested dict d (same shape as page_dict), adding one
# bookmark / ToC entry per directory and recording the page index of every file.
#   d                - mapping of name -> nested mapping (empty for files and
#                      for placeholder directories)
#   base_path        - directory that the names in d / path_list are relative to
#   empty_parents_in - list of directories without any page files, passed down
#                      from the caller's level
def iterdict(d, base_path="", empty_parents_in=list()):
global ident_level
global path_list
global last_page_index
global page_ref
global toc_dict_list
for k, v in d.items():
# Absolute, resolved path of the current entry
filepath = os.path.join(base_path, os.path.sep.join(path_list))
filename = os.path.join(filepath, k)
filename = os.path.realpath(filename)
# Get parent bookmark
if len(bookmark_list) > 0:
bm_parent = bookmark_list[-1]
else:
bm_parent = None
# Get bookmark name
if filename in page_dir_rename_dict:
# Name is defined in a rename file
bm_name = page_dir_rename_dict[filename]
elif args["order_number_separator"] != None:
# Remove leading order numbers from dir name
# (everything up to the first separator is dropped)
k_split = k.split(args["order_number_separator"])
if len(k_split) <= 1:
bm_name = k
else:
bm_name = args["order_number_separator"].join(k_split[1:]).strip(" ")
else:
bm_name = k
# By default a bookmark points at the page after the last file seen so far
page_ref = last_page_index + 1
# Test if it's a file or a directory
if len(v) > 0:
# It's a not-fully-empty dir (pages/folders)
# Get recursive list of files and folders
v_list = [str(x) for x in Path(filename).glob('**/*')]
v_dir_list = [x for x in v_list if os.path.isdir(x)]
v_file_list = [x for x in v_list if os.path.isfile(x) and os.path.splitext(x)[-1] in page_exts]
# Deal with recursively empty folders
# NOTE(review): empty_parents is created empty right here, so the loop
# below never executes and is_subdir_of_empty_parent stays False; possibly
# empty_parents_in was meant to be consulted -- confirm intent.
empty_parents = list()
if len(v_file_list) <= 0:
# Test if is not a subdir of an empty_parent
is_subdir_of_empty_parent = False
for empty_parent in empty_parents:
if os.path.commonpath([filename, empty_parent]) == empty_parent:
is_subdir_of_empty_parent = True
if not is_subdir_of_empty_parent:
# This is the main parent
page_ref += 1
empty_parents.append(filename)
# Prevent referencing non-existent pages
page_ref = min(page_ref, num_pages - 1)
# Save to toc_dict_list
toc_dict_list.append({
"name": bm_name,
"level": ident_level,
"page": page_ref + 1
})
ident_level += 1
if not args["no_pdf"]:
# Add bookmark w/ parent, save as potential parent
bm = output_pdf.addBookmark(bm_name, page_ref, parent=bm_parent)
# Add to bookmarks list
bookmark_list.append(bm)
path_list.append(k)
# Do recursion
iterdict(v, base_path=base_path, empty_parents_in=empty_parents)
# Unwind the state pushed for this directory
# (bookmark_list was only pushed to when a PDF is being made)
if not args["no_pdf"]:
temp = bookmark_list.pop()
temp = path_list.pop()
ident_level -= 1
else:
# Either it's a file or an empty (placeholder) dir
if os.path.isdir(filename):
# It's a totally empty directory, make an "empty" bookmark (no pages/children, references next page)
# Deal with children of empty parents
empty_parents = empty_parents_in
is_subdir_of_empty_parent = False
for empty_parent in empty_parents:
if os.path.commonpath([filename, empty_parent]) == empty_parent:
is_subdir_of_empty_parent = True
if is_subdir_of_empty_parent:
# Adjust page number forward
page_ref += 1
# Prevent referencing non-existent pages
page_ref = min(page_ref, num_pages - 1)
# Save to toc_dict_list
toc_dict_list.append({
"name": bm_name,
"level": ident_level,
"page": page_ref + 1
})
if not args["no_pdf"]:
# Add bookmark w/ parent, abandon as potential parent
temp = output_pdf.addBookmark(bm_name, page_ref, parent=bm_parent)
else:
# It's a file
# Files get no bookmark; only remember their page index
# (list.index raises ValueError if filename is not in page_list_files)
page_index = page_list_files.index(filename)
last_page_index = page_index
# Start the walk at the root of the page tree
iterdict(page_dict, base_path=final_input_dir)
print("\tDone!")
if not args["no_pdf"]:
    print("Saving bookmarked PDF: {}".format(output_file))
    # Assemble the PDF metadata; the title is only written when one was
    # explicitly provided (argument or title/name file)
    pdf_metadata_dict = {'/Title': pdf_title} if use_pdf_title else {}
    pdf_metadata_dict.update({
        '/Author': pdf_author,
        '/Producer': PROG_FILE_NAME,
    })
    output_pdf.addMetadata(pdf_metadata_dict)
    # Write the final PDF
    with open(output_file, 'wb') as final_pdf_handle:
        output_pdf.write(final_pdf_handle)
    print("\tDone!")
print()
print("-------- TABLE OF CONTENTS --------")
print("Building Table of Contents from bookmark hierarchy...")


# Make row of ToC function
def make_toc_row(bm_dict_in):
    """Format one ToC entry (dict with "name", "level", "page") as text.

    The row is: page prefix + padded page number + postfix + one indent
    string per level + name.  When a break limit is set and the name is
    longer than it, the name is wrapped and continuation lines are aligned
    under the first one.
    """
    ident = ident_str * bm_dict_in["level"]
    page_toc_prefix = pagenum_pre + str(bm_dict_in["page"]).ljust(num_pages_len) + pagenum_post
    page_toc_base = page_toc_prefix + ident
    entry_name = bm_dict_in["name"]
    # Break name into multiple lines if it's too long
    if toc_line_break_limit is not None and len(entry_name) > toc_line_break_limit:
        nm_lines = textwrap.fill(entry_name, width=toc_line_break_limit, break_long_words=False).split("\n")
        base_space = " " * len(page_toc_base)
        return "\n".join([page_toc_base + nm_lines[0]] + [base_space + line for line in nm_lines[1:]])
    # If we didn't break it up, just return it
    return page_toc_base + entry_name


def _center_wrapped(text, width):
    """Wrap text to width columns and centre every resulting line."""
    wrapped_lines = textwrap.fill(text, width=width, break_long_words=False).split("\n")
    return "\n".join(line.center(width) for line in wrapped_lines)


# Make rows of ToC
toc_row_list = [make_toc_row(r) for r in toc_dict_list]
if toc_row_list:
    # Get max line width (rows may span several lines)
    toc_header_width = max(len(line) for row in toc_row_list for line in row.split("\n"))
    # Get ToC title
    toc_title = [_center_wrapped(pdf_title, toc_header_width)]
    # Get ToC author (if applicable)
    toc_author = list()
    if len(pdf_author) > 0:
        toc_author = [_center_wrapped("by " + pdf_author, toc_header_width)]
    # Get ToC label
    toc_label = [_center_wrapped("Table of Contents", toc_header_width)]
    # Get ToC separator
    toc_sep = ["-" * toc_header_width]
    # Get page count text
    toc_pagecount = ["\tPage count: {}".format(num_pages)]
    # Make list with all ToC lines
    final_toc_list = toc_title + toc_author + toc_label + toc_sep + toc_row_list + toc_pagecount
else:
    # No bookmarks were produced (e.g. no subdirectories): there is nothing
    # to lay out, and max() over an empty row list would raise ValueError.
    final_toc_list = list()
print("\tDone!")
# Print ToC lines
print()
for r in final_toc_list:
    print(r)
# Completion report; picked up by the exit handler and printed after cleanup
def job_complete():
    """Print the final summary: page count and, for PDF runs, location and size."""
    print()
    print("-------- JOB COMPLETE --------")
    print("Page count: {}".format(num_pages))
    if args["no_pdf"]:
        return
    print("Final PDF location: {}".format(output_file))
    print("File size: {} bytes".format(os.path.getsize(output_file)))
except KeyboardInterrupt:
print()
print()
print()
print("[CTRL-C] Exiting...")