1 /*
2 * Branch/Call/Jump (BCJ) filter decoders
3 *
4 * Authors: Lasse Collin <lasse.collin@tukaani.org>
5 * Igor Pavlov <http://7-zip.org/>
6 *
7 * This file has been put into the public domain.
8 * You can do whatever you want with this file.
9 */
10
11 #include "xz_private.h"
12
13 /*
14 * The rest of the file is inside this ifdef. It makes things a little more
15 * convenient when building without support for any BCJ filters.
16 */
17 #ifdef XZ_DEC_BCJ
18
struct xz_dec_bcj {
	/* Type of the BCJ filter being used (values match .xz Filter IDs) */
	enum {
		BCJ_X86 = 4,		/* x86 or x86-64 */
		BCJ_POWERPC = 5,	/* Big endian only */
		BCJ_IA64 = 6,		/* Big or little endian */
		BCJ_ARM = 7,		/* Little endian only */
		BCJ_ARMTHUMB = 8,	/* Little endian only */
		BCJ_SPARC = 9		/* Big or little endian */
	} type;

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state (E8/E9 match mask carried between calls) */
	uint32_t x86_prev_mask;

	/*
	 * Temporary space to hold the variables from struct xz_buf while
	 * b->out* is redirected at temp.buf in xz_dec_bcj_run().
	 */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type		Alignment	Look-ahead
		 * x86		    1		    4
		 * PowerPC	    4		    0
		 * IA-64	   16		    0
		 * ARM		    4		    0
		 * ARM-Thumb	    2		    2
		 * SPARC	    4		    0
		 */
		uint8_t buf[16];
	} temp;
};
77
78 #ifdef XZ_DEC_X86
79 /*
80 * This is used to test the most significant byte of a memory address
81 * in an x86 instruction.
82 */
/*
 * Test whether the most significant byte of an x86 memory address looks
 * like a sign-extension byte, i.e. is 0x00 or 0xFF.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	/* Unsigned wrap maps 0xFF -> 0 and 0x00 -> 1; everything else > 1. */
	return (uint8_t)(b + 1) <= 1;
}
87
/*
 * Decode the x86 BCJ filter: convert the absolute addresses stored in
 * E8 (CALL rel32) and E9 (JMP rel32) instructions back to relative
 * displacements. Returns the number of bytes that were fully processed;
 * up to four trailing bytes are always left unprocessed because the
 * filter needs four bytes of look-ahead past the opcode.
 */
static noinline_for_stack size_t XZ_FUNC bcj_x86(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * Indexed by prev_mask: whether a conversion is still allowed given
	 * the pattern of recently seen E8/E9 bytes.
	 */
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	/* Indexed by prev_mask: which byte of the address to test next. */
	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	/* Position of the previous E8/E9 byte; (size_t)-1 = none seen yet */
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	/* Need at least one opcode byte plus four address bytes. */
	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		/* Match both E8 and E9 with a single masked compare. */
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		/* Distance from the previous E8/E9 byte. */
		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			/* Too far back to interact; forget the history. */
			prev_mask = 0;
		} else {
			/* Shift the mask to align with the current opcode. */
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					/* Skip this match; record it in the mask. */
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			/*
			 * Iterate until the converted address no longer looks
			 * like it could itself have been converted (matches
			 * the encoder's behavior on overlapping instructions).
			 */
			while (true) {
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			/* Keep 25 bits and sign-extend bit 24 upward. */
			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	/* Save the match history for the next call. */
	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
158 #endif
159
160 #ifdef XZ_DEC_POWERPC
bcj_powerpc(struct xz_dec_bcj * s,uint8_t * buf,size_t size)161 static noinline_for_stack size_t XZ_FUNC bcj_powerpc(
162 struct xz_dec_bcj *s, uint8_t *buf, size_t size)
163 {
164 size_t i;
165 uint32_t instr;
166
167 for (i = 0; i + 4 <= size; i += 4) {
168 instr = get_unaligned_be32(buf + i);
169 if ((instr & 0xFC000003) == 0x48000001) {
170 instr &= 0x03FFFFFC;
171 instr -= s->pos + (uint32_t)i;
172 instr &= 0x03FFFFFC;
173 instr |= 0x48000001;
174 put_unaligned_be32(instr, buf + i);
175 }
176 }
177
178 return i;
179 }
180 #endif
181
182 #ifdef XZ_DEC_IA64
/*
 * Decode the IA-64 BCJ filter: convert absolute addresses in "brl"-style
 * branch instructions back to relative form. Works on aligned 16-byte
 * instruction bundles; returns the number of bytes processed.
 */
static noinline_for_stack size_t XZ_FUNC bcj_ia64(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	/*
	 * Indexed by the bundle's 5-bit template field; each value is a
	 * bitmask telling which of the three instruction slots may contain
	 * a branch that needs conversion.
	 */
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	};

	/*
	 * The local variables take a little bit stack space, but it's less
	 * than what LZMA2 decoder takes, so it doesn't make sense to reduce
	 * stack usage here without doing that for the LZMA2 decoder too.
	 */

	/* Loop counters */
	size_t i;
	size_t j;

	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	uint32_t slot;

	/* Bitwise offset of the instruction indicated by slot */
	uint32_t bit_pos;

	/* bit_pos split into byte and bit parts */
	uint32_t byte_pos;
	uint32_t bit_res;

	/* Address part of an instruction */
	uint32_t addr;

	/* Mask used to detect which instructions to convert */
	uint32_t mask;

	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	uint64_t instr;

	/* Instruction normalized with bit_res for easier manipulation */
	uint64_t norm;

	for (i = 0; i + 16 <= size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
		/* Slots start at bit 5 and are 41 bits apart. */
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;

			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			/* Load six bytes (48 bits) little endian. */
			instr = 0;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);

			norm = instr >> bit_res;

			/* Opcode 5 with the three-bit subfield zero = branch. */
			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				/* Extract the 21-bit address (imm20 + sign). */
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr <<= 4;
				addr -= s->pos + (uint32_t)i;
				addr >>= 4;

				/* Write the converted address back in place. */
				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);

				/* Merge with the bits below bit_res and store. */
				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;

				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
			}
		}
	}

	return i;
}
265 #endif
266
267 #ifdef XZ_DEC_ARM
bcj_arm(struct xz_dec_bcj * s,uint8_t * buf,size_t size)268 static noinline_for_stack size_t XZ_FUNC bcj_arm(
269 struct xz_dec_bcj *s, uint8_t *buf, size_t size)
270 {
271 size_t i;
272 uint32_t addr;
273
274 for (i = 0; i + 4 <= size; i += 4) {
275 if (buf[i + 3] == 0xEB) {
276 addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
277 | ((uint32_t)buf[i + 2] << 16);
278 addr <<= 2;
279 addr -= s->pos + (uint32_t)i + 8;
280 addr >>= 2;
281 buf[i] = (uint8_t)addr;
282 buf[i + 1] = (uint8_t)(addr >> 8);
283 buf[i + 2] = (uint8_t)(addr >> 16);
284 }
285 }
286
287 return i;
288 }
289 #endif
290
291 #ifdef XZ_DEC_ARMTHUMB
bcj_armthumb(struct xz_dec_bcj * s,uint8_t * buf,size_t size)292 static noinline_for_stack size_t XZ_FUNC bcj_armthumb(
293 struct xz_dec_bcj *s, uint8_t *buf, size_t size)
294 {
295 size_t i;
296 uint32_t addr;
297
298 for (i = 0; i + 4 <= size; i += 2) {
299 if ((buf[i + 1] & 0xF8) == 0xF0
300 && (buf[i + 3] & 0xF8) == 0xF8) {
301 addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
302 | ((uint32_t)buf[i] << 11)
303 | (((uint32_t)buf[i + 3] & 0x07) << 8)
304 | (uint32_t)buf[i + 2];
305 addr <<= 1;
306 addr -= s->pos + (uint32_t)i + 4;
307 addr >>= 1;
308 buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
309 buf[i] = (uint8_t)(addr >> 11);
310 buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
311 buf[i + 2] = (uint8_t)addr;
312 i += 2;
313 }
314 }
315
316 return i;
317 }
318 #endif
319
320 #ifdef XZ_DEC_SPARC
bcj_sparc(struct xz_dec_bcj * s,uint8_t * buf,size_t size)321 static noinline_for_stack size_t XZ_FUNC bcj_sparc(
322 struct xz_dec_bcj *s, uint8_t *buf, size_t size)
323 {
324 size_t i;
325 uint32_t instr;
326
327 for (i = 0; i + 4 <= size; i += 4) {
328 instr = get_unaligned_be32(buf + i);
329 if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
330 instr <<= 2;
331 instr -= s->pos + (uint32_t)i;
332 instr >>= 2;
333 instr = ((uint32_t)0x40000000 - (instr & 0x400000))
334 | 0x40000000 | (instr & 0x3FFFFF);
335 put_unaligned_be32(instr, buf + i);
336 }
337 }
338
339 return i;
340 }
341 #endif
342
343 /*
344 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
345 * of data that got filtered.
346 *
347 * NOTE: This is implemented as a switch statement to avoid using function
348 * pointers, which could be problematic in the kernel boot code, which must
349 * avoid pointers to static data (at least on x86).
350 */
bcj_apply(struct xz_dec_bcj * s,uint8_t * buf,size_t * pos,size_t size)351 static void XZ_FUNC bcj_apply(struct xz_dec_bcj *s,
352 uint8_t *buf, size_t *pos, size_t size)
353 {
354 size_t filtered;
355
356 buf += *pos;
357 size -= *pos;
358
359 switch (s->type) {
360 #ifdef XZ_DEC_X86
361 case BCJ_X86:
362 filtered = bcj_x86(s, buf, size);
363 break;
364 #endif
365 #ifdef XZ_DEC_POWERPC
366 case BCJ_POWERPC:
367 filtered = bcj_powerpc(s, buf, size);
368 break;
369 #endif
370 #ifdef XZ_DEC_IA64
371 case BCJ_IA64:
372 filtered = bcj_ia64(s, buf, size);
373 break;
374 #endif
375 #ifdef XZ_DEC_ARM
376 case BCJ_ARM:
377 filtered = bcj_arm(s, buf, size);
378 break;
379 #endif
380 #ifdef XZ_DEC_ARMTHUMB
381 case BCJ_ARMTHUMB:
382 filtered = bcj_armthumb(s, buf, size);
383 break;
384 #endif
385 #ifdef XZ_DEC_SPARC
386 case BCJ_SPARC:
387 filtered = bcj_sparc(s, buf, size);
388 break;
389 #endif
390 default:
391 /* Never reached but silence compiler warnings. */
392 filtered = 0;
393 break;
394 }
395
396 *pos += filtered;
397 s->pos += filtered;
398 }
399
400 /*
401 * Flush pending filtered data from temp to the output buffer.
402 * Move the remaining mixture of possibly filtered and unfiltered
403 * data to the beginning of temp.
404 */
bcj_flush(struct xz_dec_bcj * s,struct xz_buf * b)405 static void XZ_FUNC bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
406 {
407 size_t copy_size;
408
409 copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
410 memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
411 b->out_pos += copy_size;
412
413 s->temp.filtered -= copy_size;
414 s->temp.size -= copy_size;
415 memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
416 }
417
/*
 * The BCJ filter functions are primitive in the sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 */
/*
 * Run the BCJ filter over data produced by the next filter (LZMA2).
 *
 * The BCJ filter functions are primitive in the sense that they process
 * the data in chunks of 1-16 bytes and may leave a few trailing bytes
 * unfiltered. To hide this from the caller, this function buffers the
 * unfiltered tail in s->temp between calls.
 */
XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_run(struct xz_dec_bcj *s,
		struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If everything cannot be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This needs to be always run when temp.size == 0 to handle a special
	 * case where the output buffer is full and the next filter has no
	 * more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can do that too, since the last few bytes that remain
		 * unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		/* Move the unfiltered tail back to temp for the next call. */
		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * to try decoding more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		/* Restore the caller's buffer before checking s->ret. */
		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}
531
xz_dec_bcj_create(bool single_call)532 XZ_EXTERN struct xz_dec_bcj * XZ_FUNC xz_dec_bcj_create(bool single_call)
533 {
534 struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
535 if (s != NULL)
536 s->single_call = single_call;
537
538 return s;
539 }
540
xz_dec_bcj_reset(struct xz_dec_bcj * s,uint8_t id)541 XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_reset(
542 struct xz_dec_bcj *s, uint8_t id)
543 {
544 switch (id) {
545 #ifdef XZ_DEC_X86
546 case BCJ_X86:
547 #endif
548 #ifdef XZ_DEC_POWERPC
549 case BCJ_POWERPC:
550 #endif
551 #ifdef XZ_DEC_IA64
552 case BCJ_IA64:
553 #endif
554 #ifdef XZ_DEC_ARM
555 case BCJ_ARM:
556 #endif
557 #ifdef XZ_DEC_ARMTHUMB
558 case BCJ_ARMTHUMB:
559 #endif
560 #ifdef XZ_DEC_SPARC
561 case BCJ_SPARC:
562 #endif
563 break;
564
565 default:
566 /* Unsupported Filter ID */
567 return XZ_OPTIONS_ERROR;
568 }
569
570 s->type = id;
571 s->ret = XZ_OK;
572 s->pos = 0;
573 s->x86_prev_mask = 0;
574 s->temp.filtered = 0;
575 s->temp.size = 0;
576
577 return XZ_OK;
578 }
579
580 #endif
581