-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidate_pdf.py
executable file
·67 lines (51 loc) · 1.86 KB
/
validate_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
import os, filecmp, shutil, time, math
from pyPdf import PdfFileReader
#from pyPDF2 import PdfFileReader
# Sort every uploaded PDF under BASE_DIR into GOOD_DIR (parsable) or BAD_DIR
# (unparsable), then print page-count statistics for the parsable ones.
BASE_DIR = "/home/content/uploaded/pdfs"
BAD_DIR = "/home/content/uploaded/pdfs/badfiles"
GOOD_DIR = "/home/content/uploaded/pdfs/goodfiles"
# Directories that already hold sorted files and must not be rescanned.
ignore_dirs = [BAD_DIR, GOOD_DIR]


def collect_files(base_dir, skip_dirs):
    """Return the paths of all files under base_dir, excluding skip_dirs.

    A directory whose path (as yielded by os.walk) appears in skip_dirs is
    skipped together with everything beneath it.
    """
    found = []
    for dirpath, dirnames, filenames in os.walk(base_dir):
        if dirpath in skip_dirs:
            # Emptying dirnames in place stops os.walk from descending, so
            # sub-directories of an ignored directory are skipped as well.
            del dirnames[:]
            continue
        for name in filenames:
            found.append(os.path.join(dirpath, name))
    return found


def count_pdf_pages(path):
    """Return the number of pages in the PDF at path.

    Any exception raised while opening or parsing the file propagates to the
    caller. The file handle is always closed before returning, so the caller
    can safely move the file afterwards.
    """
    with open(path, "rb") as handle:
        return PdfFileReader(handle).getNumPages()


def page_statistics(page_counts):
    """Return (total, mean, standard_error) for a list of page counts.

    The standard error is the population standard deviation divided by
    sqrt(n). An empty list yields (0, 0.0, 0.0) instead of dividing by zero.
    """
    count = len(page_counts)
    total = sum(page_counts)
    if count == 0:
        return total, 0.0, 0.0
    mean = float(total) / count
    deviation_sq_sum = sum((pages - mean) * (pages - mean)
                           for pages in page_counts)
    std_dev = math.sqrt(deviation_sq_sum / count)
    return total, mean, std_dev / math.sqrt(count)


file_list = collect_files(BASE_DIR, ignore_dirs)
num_pages = []
counter = 0  # number of .pdf files examined (good and bad)
for files in file_list:
    raw_name = os.path.basename(files)
    ext = os.path.splitext(raw_name)[1].lower()
    if ext != '.pdf':
        continue
    counter += 1
    print("trying " + files)
    try:
        pages = count_pdf_pages(files)
    except Exception:
        # Any failure to open or parse marks the file as bad. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit abort the run instead of misfiling the file.
        target_dir = BAD_DIR
    else:
        num_pages.append(pages)
        target_dir = GOOD_DIR
    file2 = os.path.join(target_dir, raw_name)
    print("Moving " + files + " to " + file2)
    # The move happens outside the try block: a failed move is a real error
    # and must not be mistaken for a bad PDF.
    shutil.move(files, file2)

total_pages, ave_pages, standard_error = page_statistics(num_pages)
print("-----------------------------")
print("Total number of PDFs {:10}".format(counter))
print("Total Number of pages {:10}".format(total_pages))
print("Average Number of pages {:.4} ".format(ave_pages) + " +/- {:.4}".format(standard_error))