-
Notifications
You must be signed in to change notification settings - Fork 23
/
kl4e.c
346 lines (337 loc) · 13.5 KB
/
kl4e.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
/*
* KL4E decompression code, reverse engineered from the 6.60 firmware by artart78.
* Original function: UtilsForKernel_6C6887EE from sysmem.prx.
*
* The original code is most likely written directly on assembly and quite obfuscated (which might just be
* a consequence of heavy optimization). This code is a bit simplified on some points (replaced inlined copies
* with memcpy, dropped optimized word-by-word copy, dropped a cache instruction) but should be functionally equivalent.
* It was split into several functions for readability purposes, but corresponds to a single function.
*
* KL3E's assembly code is fairly different from KL4E's, but it was found out that just changing one constant made it
* work on KL3E. There may be corner cases which are not handled since KL3E's function (sub_00000000 in loadexec)
* was not reversed, and there are very likely files which it will decompress "successfully" although they should
* give an error (for example, for copy-only not-compressed files).
*
* Many thanks to BenHur (see libLZR.h) and tpunix (see https://github.com/tpunix/kirk_engine/blob/master/npdpc/tlzrc.c)
* for providing code of variants (2RLZ and LZRC?) which helped me a lot understand how the stuff works in general.
*/
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
#define max(a, b) ((a) < (b) ? (b) : (a))
/*
* KL4E's behavior seems to be very similar to LZMA.
*
* It is based on two things:
* 1) the LZ77 algorithm
* 2) arithmetic coding.
*
* The first point is just the idea that, in order to compress a stream using repetitions, you can either:
* - output a raw byte
* - repeat a sequence of previous raw bytes.
* In the base LZ77 algorithm, the main loop first reads a bit, then:
* - if the bit is a 0, read the next 8 bits and output the byte which corresponds (called a literal),
* - if the bit is a 1, read some distance m and length n which can be encoded in various ways, and copy
* the n bytes which were output m bytes earlier.
*
* Here, instead of encoding bits directly, KL4E uses some kind of arithmetic coding. The general idea is
* that you read 4 bytes, then if you know a 1 will happen with probability p, you consider the output
* is a 1 if the 4 bytes are in the [0, 2^32 * p] interval, and a 0 if it's in the [2 ^ 32 * p, 2 ^ 32]
* interval. You then repeat the operation by taking the subinterval in which the value is, and read another
* byte when the interval became smaller than 2^24. The probabilities are also updated each time, by decaying
* (they're multiplied by 7/8 or 15/16 each time) and receiving an additional 31 or 15 if the output was indeed a 1.
*
* Compared to 2RLZ, here all the probabilities are set to a single value given by the header (instead of
* just being 0x80 ie 1/2 by default), and a literal is output directly without checking if the first bit is 0.
*
* The rest is just up to how you encode distance and length codes, which is where the difference between KL3E
* and KL4E lies (KL4E seems to be able to have bigger maximum "distance" codes), and also what probabilities
* you consider (for example, Sony uses different probabilities depending on the file's offset modulo 8).
*/
/*
* Read one bit using arithmetic coding, with a given (updated) probability and its associated decay/bonus.
*/
int read_bit(u32 *inputVal, u32 *range, u8 *probPtr, u8 **inBuf, u32 decay, u32 bonus)
{
u32 bound;
u8 prob = *probPtr;
if ((*range >> 24) == 0) {
*inputVal = (*inputVal << 8) + *((*inBuf)++);
bound = *range * prob;
*range <<= 8;
} else {
bound = (*range >> 8) * prob;
}
prob -= (prob >> decay);
if (*inputVal >= bound) {
*inputVal -= bound;
*range -= bound;
*probPtr = prob;
return 0;
} else {
*range = bound;
*probPtr = prob + bonus;
return 1;
}
}
/*
* Same as above, but with balanced probability 1/2.
*/
int read_bit_uniform(u32 *inputVal, u32 *range, u8 **inBuf)
{
if (*range >> 24 == 0) {
*inputVal = (*inputVal << 8) + *((*inBuf)++);
*range = *range << 7;
} else {
*range = *range >> 1;
}
if (*inputVal >= *range) {
*inputVal -= *range;
return 0;
} else {
return 1;
}
}
/*
* Same as above, but without normalizing the range.
*/
int read_bit_uniform_nonormal(u32 *inputVal, u32 *range)
{
*range >>= 1;
if (*inputVal >= *range) {
*inputVal -= *range;
return 0;
} else {
return 1;
}
}
/*
* Output a raw byte by reading 8 bits using arithmetic coding.
*/
void output_raw(u32 *inputVal, u32 *range, u8 *probs, u8 **inBuf, u32 *curByte, u8 *curOut, u8 shift)
{
u32 mask = (((size_t)curOut & 7) << 8) | (*curByte & 0xFF);
u8 *curProbs = &probs[((mask >> shift) & 7) * 255] - 1;
*curByte = 1;
while (*curByte < 0x100) {
u8 *curProb = &curProbs[*curByte];
*curByte <<= 1;
if (read_bit(inputVal, range, curProb, inBuf, 3, 31)) {
*curByte |= 1;
}
}
*curOut = *curByte & 0xff;
}
int decompress_kle(u8 *outBuf, int outSize, u8 *inBuf, void **end, int isKl4e)
{
u8 litProbs[2040];
u8 copyDistBitsProbs[304];
u8 copyDistProbs[144];
u8 copyCountBitsProbs[64];
u8 copyCountProbs[256];
u8 *outEnd = outBuf + outSize;
u8 *curOut = outBuf;
u32 curByte = 0;
u32 range = 0xffffffff;
u32 copyDist, copyCount;
u8 *curCopyDistBitsProbs;
u32 inputVal = (inBuf[1] << 24) | (inBuf[2] << 16) | (inBuf[3] << 8) | inBuf[4];
// Handle the direct copy case (if the file is actually not compressed).
if (inBuf[0] & 0x80) {
inBuf += 5;
u8 *dataEnd = outBuf + inputVal;
if (dataEnd >= outEnd) {
return 0x80000104; // SCE_ERROR_INVALID_SIZE
}
while (curOut < dataEnd) {
*(curOut++) = *(inBuf++);
}
inBuf--;
if (end != NULL) {
*end = inBuf;
}
return curOut - outBuf;
}
// Initialize probabilities from the header value.
u8 byte = 128 - (((inBuf[0] >> 3) & 3) << 4);
memset(litProbs, byte, sizeof(litProbs));
memset(copyCountBitsProbs, byte, sizeof(copyCountBitsProbs));
memset(copyDistBitsProbs, byte, sizeof(copyDistBitsProbs));
memset(copyCountProbs, byte, sizeof(copyCountProbs));
memset(copyDistProbs, byte, sizeof(copyDistProbs));
u8 *curCopyCountBitsProbs = copyCountBitsProbs;
/* Shift used to determine if the probabilities should be determined more by the
* output's byte alignment or by the previous byte. */
u8 shift = inBuf[0] & 0x7;
inBuf += 5;
// Read a literal directly.
output_raw(&inputVal, &range, litProbs, &inBuf, &curByte, curOut, shift);
while (1) {
curOut++;
// If we read a 0, read a literal.
if (read_bit(&inputVal, &range, curCopyCountBitsProbs, &inBuf, 4, 15) == 0) {
curCopyCountBitsProbs = max(curCopyCountBitsProbs - 1, copyCountBitsProbs);
if (curOut == outEnd) {
return 0x80000104; // SCE_ERROR_INVALID_SIZE
}
output_raw(&inputVal, &range, litProbs, &inBuf, &curByte, curOut, shift);
continue;
}
// Otherwise, first find the number of bits used in the 'length' code.
copyCount = 1;
s32 copyCountBits = -1;
while (copyCountBits < 6) {
curCopyCountBitsProbs += 8;
if (!read_bit(&inputVal, &range, curCopyCountBitsProbs, &inBuf, 4, 15)) {
break;
}
copyCountBits++;
}
// Determine the length itself, and use different distance code probabilities depending on it (and on whether it's KL3E or KL4E).
s32 powLimit;
if (copyCountBits >= 0) {
u8 *probs = ©CountProbs[(copyCountBits << 5) | ((((size_t)curOut & 3) << (copyCountBits + 3)) & 0x18) | ((size_t)curCopyCountBitsProbs & 7)];
if (copyCountBits < 3) {
copyCount = 1;
} else {
copyCount = 2 + read_bit(&inputVal, &range, probs + 24, &inBuf, 3, 31);
if (copyCountBits > 3) {
copyCount = (copyCount << 1) | read_bit(&inputVal, &range, probs + 24, &inBuf, 3, 31);
if (copyCountBits > 4) {
copyCount = (copyCount << 1) | read_bit_uniform(&inputVal, &range, &inBuf);
}
for (u32 i = 5; i < copyCountBits; i++) {
copyCount = (copyCount << 1) | read_bit_uniform_nonormal(&inputVal, &range);
}
}
}
copyCount = copyCount << 1;
if (read_bit(&inputVal, &range, probs, &inBuf, 3, 31)) {
copyCount |= 1;
if (copyCountBits <= 0) {
powLimit = isKl4e ? 256 : 128;
curCopyDistBitsProbs = ©DistBitsProbs[56 + copyCountBits];
}
} else {
if (copyCountBits <= 0) {
powLimit = 64;
curCopyDistBitsProbs = ©DistBitsProbs[copyCountBits];
}
}
if (copyCountBits > 0) {
copyCount = (copyCount << 1) | read_bit(&inputVal, &range, probs + 8, &inBuf, 3, 31);
if (copyCountBits != 1) {
copyCount = copyCount << 1;
if (read_bit(&inputVal, &range, probs + 16, &inBuf, 3, 31)) {
copyCount = copyCount + 1;
if (copyCount == 0xFF) {
if (end != NULL) {
*end = inBuf;
}
return curOut - outBuf;
}
}
}
curCopyDistBitsProbs = ©DistBitsProbs[56 + copyCountBits];
powLimit = isKl4e ? 256 : 128;
}
} else {
powLimit = 64;
curCopyDistBitsProbs = ©DistBitsProbs[copyCountBits];
}
// Find out the number of bits used for distance codes.
s32 curPow = 8;
int skip = 0;
s32 copyDistBits;
while (1) {
u8 *curProb = curCopyDistBitsProbs + (curPow - 7);
curPow <<= 1;
copyDistBits = curPow - powLimit;
if (!read_bit(&inputVal, &range, curProb, &inBuf, 3, 31)) {
if (copyDistBits >= 0) {
if (copyDistBits != 0) {
copyDistBits -= 8;
break;
}
copyDist = 0;
if (curCopyDistBitsProbs == curOut) { // Is this a mistake by Sony?
return 0x80000108; // SCE_ERROR_INVALID_FORMAT
}
skip = 1; // Just copy with a zero distance.
break;
}
} else {
curPow += 8;
if (copyDistBits >= 0)
break;
}
}
if (!skip) {
// Find out the distance itself.
u8 *curProbs = ©DistProbs[copyDistBits];
s32 readBits = copyDistBits / 8;
if (readBits < 3) {
copyDist = 1;
} else {
copyDist = 2 + read_bit(&inputVal, &range, curProbs + 3, &inBuf, 3, 31);
if (readBits > 3) {
copyDist = (copyDist << 1) | read_bit(&inputVal, &range, curProbs + 3, &inBuf, 3, 31);
if (readBits > 4) {
copyDist = (copyDist << 1) | read_bit_uniform(&inputVal, &range, &inBuf);
readBits--;
}
while (readBits > 4) {
copyDist = copyDist << 1;
copyDist += read_bit_uniform_nonormal(&inputVal, &range);
readBits--;
}
}
}
copyDist = copyDist << 1;
if (read_bit(&inputVal, &range, curProbs, &inBuf, 3, 31)) {
if (readBits > 0) {
copyDist = copyDist + 1;
}
} else {
if (readBits <= 0) {
copyDist = copyDist - 1;
}
}
if (readBits > 0) {
copyDist = copyDist << 1;
if (read_bit(&inputVal, &range, curProbs + 1, &inBuf, 3, 31)) {
if (readBits != 1) {
copyDist = copyDist + 1;
}
} else {
if (readBits == 1) {
copyDist = copyDist - 1;
}
}
if (readBits != 1) {
copyDist = copyDist << 1;
if (!read_bit(&inputVal, &range, curProbs + 2, &inBuf, 3, 31)) {
copyDist = copyDist - 1;
}
}
}
if (copyDist >= curOut - outBuf) {
return 0x80000108; // SCE_ERROR_INVALID_FORMAT
}
}
// Copy the bytes with the given count and distance
for (u32 i = 0; i < copyCount + 1; i++) {
curOut[i] = (curOut - copyDist - 1)[i];
}
curByte = curOut[copyCount];
curOut += copyCount;
curCopyCountBitsProbs = ©CountBitsProbs[6 + ((size_t)curOut & 1)];
}
}