-
Notifications
You must be signed in to change notification settings - Fork 4
/
extract-ebooklib.py
161 lines (127 loc) · 4.28 KB
/
extract-ebooklib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#! /usr/bin/env python3
"""
EPUB text extraction with Ebooklib demo
Requires Ebooklib:
https://github.com/aerkalov/ebooklib
"""
import os
import sys
import csv
import argparse
from html.parser import HTMLParser
import ebooklib
from ebooklib import epub
# Module-level parser shared by the script; its arguments are registered later
argParser = argparse.ArgumentParser(description="Extract text from EPUB files")
def parseCommandLine():
    """Register the positional arguments and parse the command line

    Returns the namespace produced by the module-level argument parser,
    carrying the attributes dirIn and dirOut.
    """
    # (argument name, help text) for each required positional argument
    positionals = (
        ('dirIn', 'directory with input EPUB files'),
        ('dirOut', 'output directory'),
    )
    for argName, helpText in positionals:
        argParser.add_argument(argName,
                               action='store',
                               type=str,
                               help=helpText)

    return argParser.parse_args()
def errorExit(msg):
    """Write msg to stderr with an "ERROR: " prefix, then exit with status 1"""
    line = "".join(("ERROR: ", msg, "\n"))
    sys.stderr.write(line)
    raise SystemExit(1)
def errorInfo(msg):
    """Write msg to stderr with an "ERROR: " prefix, without exiting"""
    line = "".join(("ERROR: ", msg, "\n"))
    sys.stderr.write(line)
class HTMLFilter(HTMLParser):
    """
    HTML parser that collects only what the parser reports as character
    data; after feeding markup, the collected text is in the text attribute.

    Source: https://stackoverflow.com/a/55825140/1209004
    """
    # Empty class-level default; the first handle_data call rebinds text on
    # the instance, so every parser accumulates its own string
    text = ""

    def handle_data(self, data):
        """Append one run of character data to the collected text"""
        self.text = self.text + data
def extractEbooklib(fileIn, fileOut):
    """Extract text from input file using Ebooklib
    and write result to output file

    Every item of type ITEM_DOCUMENT is run through HTMLFilter and the
    resulting text fragments are concatenated in the order in which
    Ebooklib yields the items. The output file is only written if the
    whole book was processed without raising an exception.

    Args:
        fileIn: path of the EPUB file to read.
        fileOut: path of the text file to write (UTF-8 encoded).

    Returns:
        The number of whitespace-separated words in the extracted text,
        or 0 if the EPUB could not be parsed. The count is returned even
        if writing the output file fails.

    Parsing and writing problems are reported on stderr through errorInfo;
    no exception is propagated to the caller for those failures.
    """
    # Try to parse the file with Ebooklib, and report an error message if
    # parsing fails
    try:
        book = epub.read_epub(fileIn)
        parts = []
        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            bodyContent = item.get_body_content().decode()
            f = HTMLFilter()
            f.feed(bodyContent)
            # feed() may hold back trailing text (e.g. a fragment ending in
            # "AT&T" with no tag after it) in case more input follows;
            # close() forces that buffered data through handle_data
            f.close()
            parts.append(f.text)
        # Join once instead of growing a string inside the loop
        content = "".join(parts)
    except Exception:
        # Ebooklib can fail in many ways on malformed files; any failure
        # is reported and the file is skipped
        msg = "error parsing " + fileIn
        errorInfo(msg)
        return 0

    # Word count
    noWords = len(content.split())

    # Write extracted text to a text file
    try:
        with open(fileOut, 'w', encoding='utf-8') as fout:
            fout.write(content)
    except UnicodeError:
        msg = "Unicode error on writing " + fileOut
        errorInfo(msg)
    except OSError:
        msg = "error writing " + fileOut
        errorInfo(msg)
    except Exception:
        msg = "unknown error writing " + fileOut
        errorInfo(msg)

    return noWords
def main():
    """Main command line interface

    Reads the input and output directories from the command line, extracts
    the text of every file with an .epub extension found directly in the
    input directory (sub-directories are not searched) and writes a summary
    file "summary-ebooklib.csv" with one row per processed file (file name
    and word count) to the output directory.

    Exits through errorExit if either directory does not exist. Errors on
    writing the summary file are reported on stderr through errorInfo.
    """
    # Get command line arguments
    args = parseCommandLine()
    dirIn = args.dirIn
    dirOut = args.dirOut

    # Check if input and output directories exist, and exit if not
    if not os.path.isdir(dirIn):
        errorExit("input dir doesn't exist")
    if not os.path.isdir(dirOut):
        errorExit("output dir doesn't exist")

    # Summary output file
    csvOut = os.path.join(dirOut, "summary-ebooklib.csv")
    csvList = [["fileName", "noWords"]]

    # Iterate over files in input directory; os.listdir returns entries in
    # arbitrary order, so sort them to make the summary reproducible
    for filename in sorted(os.listdir(dirIn)):
        fIn = os.path.abspath(os.path.join(dirIn, filename))
        if not os.path.isfile(fIn):
            continue

        # Get base name and extension for each file
        baseName, extension = os.path.splitext(filename)

        # Only process files with .epub extension (case-insensitive,
        # just to be safe)
        if extension.upper() != ".EPUB":
            continue

        fOutText = os.path.join(dirOut, baseName + "_ebooklib.txt")
        noWords = extractEbooklib(fIn, fOutText)
        csvList.append([filename, noWords])

    # Write summary file. newline='' leaves line endings to the csv writer;
    # without it, text-mode newline translation turns the writer's "\r\n"
    # into "\r\r\n" on Windows, which shows up as blank rows
    try:
        with open(csvOut, 'w', encoding='utf-8', newline='') as csvout:
            csv.writer(csvout).writerows(csvList)
    except UnicodeError:
        msg = "Unicode error on writing " + csvOut
        errorInfo(msg)
    except OSError:
        msg = "error writing " + csvOut
        errorInfo(msg)
    except Exception:
        msg = "unknown error writing " + csvOut
        errorInfo(msg)
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module
if __name__ == "__main__":
    main()