-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmobimergehdimage.py
225 lines (179 loc) · 9.48 KB
/
mobimergehdimage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import struct, imghdr, os, sys
#################################################################################
# #
# MOBI HDImage Merger #
# #
# 2019 Choryu Park (Kyoungkyu Park) #
# #
#################################################################################
# This source code is based on these documents, and source codes;
# https://wiki.mobileread.com/wiki/MOBI
# https://wiki.mobileread.com/wiki/PDB
# https://www.mobileread.com/forums/showpost.php?p=3114050&postcount=1145
#
# Thanks to author of 'DumpAZW6'. It many helps to write this source code.
#################################################################################
# * Important; In this source code, First record is 0. not 1.
# So, Last record is Record count - 1.
#
# AZW.RES (HDContainer) Structure ->
# HDContainer is including HD images. each images have own record, and it's Starts with 'CRES'.
# Images is following main book's image ordering, but not include thumbnail.
# and if image not need to HD resolution (like logo, special character), that record marked as 'Placeholder' for matching image count.
#
# -------------------------------
# | Book | HDContainer | Starts with |
# |-------------|-------------- |
# | Image 1 (SD)| HDImage 1 | <- b'CRES'
# | Image 2 | Placeholder | <- b'\xA0\xA0\xA0\xA0'
# | Image 3 (SD)| HDImage 3 | <- b'CRES'
# | Image 4 (SD)| HDImage 4 | <- b'CRES'
# | Thumbnail | *Not exist |
# -------------------------------
# * This structure is sample. Originally, there is no index number.
## Header offsets for Book file
# 2-byte unsigned short
BHDR_OFFSET_NUM_OF_RECORD = 76
# 4-byte unsigned long
BHDR_OFFSET_FILE_IDENT = 60
BHDR_RECORD0_OFFSET_TEXT_ENCODING = 28
BHDR_RECORD0_OFFSET_FULL_NAME_OFFSET = 84
BHDR_RECORD0_OFFSET_FULL_NAME_LENGTH = 88
BHDR_RECORD0_OFFSET_FIRST_IMAGE_INDEX = 108
# 8-byte Record info structure
# - [record Data Offset] 4-byte unsinged long
# - [unused] 4-byte
BHDR_OFFSET_RECORD_INFO_LIST = 78
## Header offsets for HDContainer file
CHDR_OFFSET_FILE_IDENT = 60
CHDR_RECORD0_OFFSET_TEXT_ENCODING = 12
CHDR_RECORD0_OFFSET_FULL_NAME_OFFSET = 40
CHDR_RECORD0_OFFSET_FULL_NAME_LENGTH = 44
TEXT_ENCODING_MAP = {
65001 : "UTF-8",
1252 : "windows-1252"
}
CONTAINER_CONTENT_TYPE_IMAGE = "IMAGE"
CONTAINER_CONTENT_TYPE_PLACEHOLDER = "PLACEHOLDER"
CONTAINER_NEEDED_TYPES = {
b"CRES" : CONTAINER_CONTENT_TYPE_IMAGE,
b"\xa0\xa0\xa0\xa0" : CONTAINER_CONTENT_TYPE_PLACEHOLDER
}
# Actually, this function is grabbed from 'DumpAZW6'. That script can getting from URL where in header of this code.
def get_image_type(imgdata):
imgtype = imghdr.what(None, imgdata)
# horrible hack since imghdr detects jxr/wdp as tiffs
if imgtype is not None and imgtype == "tiff":
imgtype = "wdp"
# imghdr only checks for JFIF or Exif JPEG files. Apparently, there are some
# with only the magic JPEG bytes out there...
# ImageMagick handles those, so, do it too.
if imgtype is None:
if imgdata[0:2] == b'\xFF\xD8':
# Get last non-null bytes
last = len(imgdata)
while (imgdata[last-1:last] == b'\x00'):
last-=1
# Be extra safe, check the trailing bytes, too.
if imgdata[last-2:last] == b'\xFF\xD9':
imgtype = "jpeg"
return imgtype
def get_charset(data, offset):
charset_id, = struct.unpack_from(">L", data, offset)
if charset_id in TEXT_ENCODING_MAP:
return TEXT_ENCODING_MAP[charset_id]
else:
print("Unknown Charset %d" % charset_id)
raise
def get_book_title(data, base_offset, name_offset, length_offset, charset):
name_offset, = struct.unpack_from(">L", data, base_offset + name_offset)
name_length, = struct.unpack_from(">L", data, base_offset + length_offset)
name_offset += base_offset
return data[name_offset:name_offset + name_length].decode(charset)
class MobiMergeHDImage:
hdimage_dict = None
def __init__(self, mobi_data):
self.mobi = bytearray(mobi_data)
self.record_dict = self.get_record_dict(self.mobi)
if self.mobi[BHDR_OFFSET_FILE_IDENT:BHDR_OFFSET_FILE_IDENT+8] != b'BOOKMOBI':
print("This eBook is not a Mobi.")
raise
self.charset = get_charset(self.mobi, self.record_dict[0]["OFFSET"] + BHDR_RECORD0_OFFSET_TEXT_ENCODING)
self.book_title = get_book_title(self.mobi, self.record_dict[0]["OFFSET"], BHDR_RECORD0_OFFSET_FULL_NAME_OFFSET, BHDR_RECORD0_OFFSET_FULL_NAME_LENGTH, self.charset)
def get_record_dict(self, data):
current_offset = BHDR_OFFSET_NUM_OF_RECORD
record_count, = struct.unpack_from(">H", data, current_offset)
record_dict = dict()
record_dict[sys.maxsize] = record_count
for index in range(0, record_count):
record_dict[index] = dict()
offset = BHDR_OFFSET_RECORD_INFO_LIST + index * 8
record_dict[index]["INFO_OFFSET"] = offset
record_dict[index]["OFFSET"], = struct.unpack_from(">L", data, offset)
return record_dict
def load_azwres(self, res_file):
with open(res_file, 'rb') as azwresfile:
azwres = azwresfile.read()
if azwres[CHDR_OFFSET_FILE_IDENT:CHDR_OFFSET_FILE_IDENT + 8] != b'RBINCONT':
print("%s is not a HDImage container." % os.path.basename(res_file))
raise
azwres_dict = self.get_record_dict(azwres)
azwres_title = get_book_title(azwres, azwres_dict[0]["OFFSET"], CHDR_RECORD0_OFFSET_FULL_NAME_OFFSET, CHDR_RECORD0_OFFSET_FULL_NAME_LENGTH, self.charset)
if self.book_title != azwres_title:
print("Book mismatch. Book title and HDImage container title is not same. ")
print("Book: %s, Container: %s" % (self.book_title, azwres_title))
raise
self.hdimage_dict = dict()
image_index = 0
for index, val in sorted(azwres_dict.items()):
if index != sys.maxsize and azwres[val["OFFSET"]:val["OFFSET"] + 2] != b"\xe9\x8e":
if azwres[val["OFFSET"]:val["OFFSET"] + 4] in CONTAINER_NEEDED_TYPES:
self.hdimage_dict[image_index] = dict()
self.hdimage_dict[image_index]["INDEX"] = index
self.hdimage_dict[image_index]["TYPE"] = CONTAINER_NEEDED_TYPES[azwres[val["OFFSET"]:val["OFFSET"] + 4]]
if self.hdimage_dict[image_index]["TYPE"] == CONTAINER_CONTENT_TYPE_IMAGE:
self.hdimage_dict[image_index]["CONTENT"] = azwres[val["OFFSET"] + 12:azwres_dict[index + 1]["OFFSET"]]
image_index += 1
def record_offset_update(self, record_index, modified_offset_size):
for target_record_index in range(record_index + 1, self.record_dict[sys.maxsize]):
newoffset = self.record_dict[target_record_index]["OFFSET"] + modified_offset_size
self.record_dict[target_record_index]["OFFSET"] = newoffset
infooffset = self.record_dict[target_record_index]["INFO_OFFSET"]
self.mobi[infooffset:infooffset + 4] = struct.pack(">L", newoffset)
def merge(self):
if self.hdimage_dict is None:
print("'azw.res' file is not loaded yet.")
raise
first_record_offset = self.record_dict[0]['OFFSET']
# Find original book's image
first_image_index, = struct.unpack_from(">L", self.mobi, first_record_offset + 108)
images_dict = dict()
image_index = 0
for record_index in range(first_image_index, self.record_dict[sys.maxsize]):
if record_index + 1 == self.record_dict[sys.maxsize]:
img = self.mobi[self.record_dict[record_index]["OFFSET"]:]
else:
img = self.mobi[self.record_dict[record_index]["OFFSET"]:self.record_dict[record_index + 1]["OFFSET"]]
if get_image_type(img) is not None:
images_dict[image_index] = dict()
images_dict[image_index]["INDEX"] = record_index
images_dict[image_index]["CONTENT"] = img
image_index += 1
# Merge
for index, hdimage in list(self.hdimage_dict.items()):
if hdimage["TYPE"] == CONTAINER_CONTENT_TYPE_IMAGE:
# If HDImage type is IMAGE, do merge.
if self.mobi.find(images_dict[index]["CONTENT"]) != -1:
size_difference = len(hdimage["CONTENT"]) - len(images_dict[index]["CONTENT"])
original_index = images_dict[index]["INDEX"]
if original_index + 1 == self.record_dict[sys.maxsize]:
self.mobi[self.record_dict[original_index]["OFFSET"]:] = hdimage["CONTENT"]
else:
self.mobi[self.record_dict[original_index]["OFFSET"]:self.record_dict[original_index + 1]["OFFSET"]] = hdimage["CONTENT"]
self.record_offset_update(images_dict[index]["INDEX"], size_difference)
else:
# Skip if it's not a image (Maybe PLACEHOLDER)
pass
return self.mobi