# py_epubtotxt.py — 141 lines (124 loc) · 5.06 KB
# NOTE: the bare numbers that follow, up to the first import statement, are the
# line-number gutter of the web page this file was copied from; they are not
# part of the program.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import ebooklib
import os
import json
import PyPDF2
from zipfile import BadZipFile
from ebooklib import epub
from bs4 import BeautifulSoup
from docx import Document
from markdownify import markdownify as md
def read_processed_files_info(file_path):
    """Load the record of already-processed files from a JSON state file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded dictionary. An empty dictionary is returned when the file
        does not exist, is empty, cannot be decoded as JSON, or decodes to
        something other than a JSON object.
    """
    # Open directly instead of checking for existence first: the file may
    # disappear between the check and the open.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            raw = file.read()
    except FileNotFoundError:
        return {}

    if not raw:
        return {}

    try:
        info = json.loads(raw)
    except json.JSONDecodeError:
        print(f"Warning: Could not decode JSON from {file_path}. Starting with an empty dictionary.")
        return {}

    # Valid JSON that is not an object (e.g. a list or null) cannot be used as
    # the path -> mtime mapping that callers index and assign into.
    if not isinstance(info, dict):
        print(f"Warning: {file_path} does not contain a JSON object. Starting with an empty dictionary.")
        return {}
    return info
def write_processed_files_info(file_path, info):
    """Persist the record of processed files as JSON.

    Args:
        file_path: Path of the JSON file to (over)write.
        info: JSON-serializable mapping to store.

    Raises:
        TypeError: If ``info`` contains a value that JSON cannot represent.
            The existing file is left untouched in that case.
    """
    # Serialize before opening the file: opening in 'w' mode truncates it, so a
    # serialization error raised afterwards would leave a partial state file.
    payload = json.dumps(info)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(payload)
def pdf2text(pdf_path):
    """Return the text of a PDF, with a newline appended after each page.

    Failures are reported with ``print`` rather than raised: a page whose
    extraction fails is left out, and an error while opening or reading the
    file ends extraction, returning whatever was collected up to that point.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts (possibly an empty string).
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            for page in reader.pages:
                try:
                    page_texts.append(page.extract_text() + "\n")
                except Exception as e:
                    print(f"Error extracting text from page in {pdf_path}: {e}")
    except Exception as e:
        print(f"Error reading PDF file {pdf_path}: {e}")
    return "".join(page_texts)
def epub2thtml(epub_path):
    """Collect the raw content of every document item in an EPUB.

    Args:
        epub_path: Path of the EPUB file to read.

    Returns:
        A list with the content of each item whose type is
        ``ebooklib.ITEM_DOCUMENT``. An empty list is returned (after printing
        a message) when the file is a bad zip archive or any other error
        occurs.
    """
    try:
        book = epub.read_epub(epub_path, {"ignore_ncx": True})
        return [
            item.get_content()
            for item in book.get_items()
            if item.get_type() == ebooklib.ITEM_DOCUMENT
        ]
    except BadZipFile:
        print(f"Bad Zip File: {epub_path} - Skipping this file.")
    except Exception as e:
        print(f"Error processing {epub_path}: {e}")
    # Reached only when one of the handlers above ran.
    return []
def chap2text(chap):
    """Return the text of one HTML chapter with the markup removed.

    The chapter is parsed with BeautifulSoup's ``html.parser``; the text
    pieces are stripped and joined with single spaces.
    """
    parsed = BeautifulSoup(chap, 'html.parser')
    plain_text = parsed.get_text(separator=' ', strip=True)
    return plain_text
def thtml2ttext(thtml):
    """Convert each HTML chapter in ``thtml`` to plain text.

    Returns:
        A list of strings, one per input chapter, in the same order.
    """
    return [chap2text(markup) for markup in thtml]
def docx_to_markdown(docx_path):
    """Return the paragraphs of a .docx file converted by markdownify.

    The text of every paragraph in ``doc.paragraphs`` is joined with
    newlines and the result is passed through ``md``.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The string produced by ``md`` for the joined paragraph text.
    """
    doc = Document(docx_path)
    # The original assigned to "doc_ text" (a stray space inside the name),
    # which is a syntax error and did not match the name read on the next line.
    doc_text = '\n'.join(para.text for para in doc.paragraphs)
    return md(doc_text)
def epub2text(epub_path):
    """Return the plain text of an EPUB as a list of per-chapter strings.

    Chains ``epub2thtml`` (read the chapter HTML) and ``thtml2ttext``
    (strip the markup).
    """
    return thtml2ttext(epub2thtml(epub_path))
def process_file(file_path, output_directory, processed_files):
    """Extract the text of one document and write it to a ``.txt`` file.

    Args:
        file_path: Path of the ``.epub``, ``.pdf`` or ``.docx`` file.
        output_directory: Existing directory that receives the output file,
            named after the input's base name with a ``.txt`` extension.
        processed_files: Mapping of file path -> modification time recorded
            when the file was last processed. Updated in place on success.

    Returns:
        1 if the file was processed, 0 if it was skipped (already processed
        with the same modification time, or unsupported extension).
    """
    last_modified = os.path.getmtime(file_path)
    if processed_files.get(file_path) == last_modified:
        print(f"Skipping already processed file: {file_path}")
        return 0

    # Determine the file type and process accordingly
    if file_path.endswith('.epub'):
        text_content = epub2text(file_path)
    elif file_path.endswith('.pdf'):
        text_content = pdf2text(file_path)
    elif file_path.endswith('.docx'):
        text_content = docx_to_markdown(file_path)
    else:
        print(f"Unsupported file format: {file_path}")
        return 0

    # epub2text returns one string per chapter while the other extractors
    # return a single string; file.write() only accepts a string, so join the
    # chapters first.
    if not isinstance(text_content, str):
        text_content = '\n'.join(text_content)

    # Write the extracted text to a file in the output directory
    # NOTE(review): inputs sharing a base name (e.g. book.epub and book.pdf, or
    # same-named files in different folders) map to the same output file.
    output_file_name = os.path.splitext(os.path.basename(file_path))[0] + '.txt'
    output_file_path = os.path.join(output_directory, output_file_name)
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(text_content)

    print(f"Processed: {file_path}")
    processed_files[file_path] = last_modified
    return 1
def main(input_directory="G:\\My Drive\\My Books\\",
         output_directory="G:\\My Drive\\My Books\\output_txt\\"):
    """Convert every supported document under a directory tree to text.

    Walks ``input_directory``, passes each ``.epub``, ``.pdf`` and ``.docx``
    file to ``process_file`` and saves the processed-files record to
    ``processed_files.json`` in ``output_directory`` after each one.

    Args:
        input_directory: Root of the tree to scan. Defaults to the path the
            script was originally hard-coded to.
        output_directory: Directory that receives the ``.txt`` files and the
            state file; created if missing. Defaults to the original
            hard-coded path.
    """
    os.makedirs(output_directory, exist_ok=True)

    processed_files_path = os.path.join(output_directory, 'processed_files.json')
    processed_files = read_processed_files_info(processed_files_path)

    supported_extensions = ('.epub', '.pdf', '.docx')
    output_key = os.path.normcase(os.path.abspath(output_directory))
    processed_count = 0

    for root, dirs, files in os.walk(input_directory):
        # Prune in place so os.walk does not descend into output folders:
        # anything named 'output_txt' (as before) and the actual output
        # directory, whatever it is called.
        dirs[:] = [
            name for name in dirs
            if name != 'output_txt'
            and os.path.normcase(os.path.abspath(os.path.join(root, name))) != output_key
        ]
        for file in files:
            file_path = os.path.join(root, file)
            print (f"Processing file: {file_path}")
            if file.endswith(supported_extensions):
                processed_count += process_file(file_path, output_directory, processed_files)
                # Update the JSON after each file so progress is kept if the
                # run is interrupted.
                write_processed_files_info(processed_files_path, processed_files)

    print(f"Total processed files: {processed_count}")
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()