/*
 * Branch/Call/Jump (BCJ) filter decoders
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <http://7-zip.org/>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

#include "xz_private.h"

/*
 * The rest of the file is inside this ifdef. It makes things a little more
 * convenient when building without support for any BCJ filters.
 */
#ifdef XZ_DEC_BCJ

struct xz_dec_bcj {
	/* Type of the BCJ filter being used */
	enum {
		BCJ_X86 = 4,        /* x86 or x86-64 */
		BCJ_POWERPC = 5,    /* Big endian only */
		BCJ_IA64 = 6,       /* Big or little endian */
		BCJ_ARM = 7,        /* Little endian only */
		BCJ_ARMTHUMB = 8,   /* Little endian only */
		BCJ_SPARC = 9       /* Big or little endian */
	} type;
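
	/*
	 * Note: the enumerator values above equal the .xz Filter IDs
	 * (0x04 .. 0x09), so the raw Filter ID from the Block Header can
	 * be passed as-is to xz_dec_bcj_reset() below.
	 */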

	/*
	 * Return value of the next filter in the chain. We need to preserve
	 * this information across calls, because we must not call the next
	 * filter anymore once it has returned XZ_STREAM_END.
	 */
	enum xz_ret ret;

	/* True if we are operating in single-call mode. */
	bool single_call;

	/*
	 * Absolute position relative to the beginning of the uncompressed
	 * data (in a single .xz Block). We care only about the lowest 32
	 * bits so this doesn't need to be uint64_t even with big files.
	 */
	uint32_t pos;

	/* x86 filter state */
	uint32_t x86_prev_mask;

	/* Temporary space to hold the variables from struct xz_buf */
	uint8_t *out;
	size_t out_pos;
	size_t out_size;

	struct {
		/* Amount of already filtered data in the beginning of buf */
		size_t filtered;

		/* Total amount of data currently stored in buf */
		size_t size;

		/*
		 * Buffer to hold a mix of filtered and unfiltered data. This
		 * needs to be big enough to hold Alignment + 2 * Look-ahead:
		 *
		 * Type         Alignment   Look-ahead
		 * x86              1           4
		 * PowerPC          4           0
		 * IA-64           16           0
		 * ARM              4           0
		 * ARM-Thumb        2           2
		 * SPARC            4           0
		 */
		uint8_t buf[16];
	} temp;
};

#ifdef XZ_DEC_X86
/*
 * This is used to test the most significant byte of a memory address
 * in an x86 instruction.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
	return b == 0x00 || b == 0xFF;
}

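/*
 * Filter the x86 E8 (CALL) and E9 (JMP) instructions: the encoder converted
 * their 32-bit displacements from relative to absolute, so convert them back
 * to relative here using s->pos. Only displacements whose most significant
 * byte is 0x00 or 0xFF are treated as branch targets; prev_mask and prev_pos
 * track recently seen E8/E9 bytes to reduce false positives in data that
 * merely resembles code.
 */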
static noinline_for_stack size_t XZ_FUNC bcj_x86(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const bool mask_to_allowed_status[8]
		= { true, true, true, false, true, false, false, false };

	static const uint8_t mask_to_bit_num[8] = { 0, 1, 2, 2, 3, 3, 3, 3 };

	size_t i;
	size_t prev_pos = (size_t)-1;
	uint32_t prev_mask = s->x86_prev_mask;
	uint32_t src;
	uint32_t dest;
	uint32_t j;
	uint8_t b;

	if (size <= 4)
		return 0;

	size -= 4;
	for (i = 0; i < size; ++i) {
		if ((buf[i] & 0xFE) != 0xE8)
			continue;

		prev_pos = i - prev_pos;
		if (prev_pos > 3) {
			prev_mask = 0;
		} else {
			prev_mask = (prev_mask << (prev_pos - 1)) & 7;
			if (prev_mask != 0) {
				b = buf[i + 4 - mask_to_bit_num[prev_mask]];
				if (!mask_to_allowed_status[prev_mask]
						|| bcj_x86_test_msbyte(b)) {
					prev_pos = i;
					prev_mask = (prev_mask << 1) | 1;
					continue;
				}
			}
		}

		prev_pos = i;

		if (bcj_x86_test_msbyte(buf[i + 4])) {
			src = get_unaligned_le32(buf + i + 1);
			while (true) {
				dest = src - (s->pos + (uint32_t)i + 5);
				if (prev_mask == 0)
					break;

				j = mask_to_bit_num[prev_mask] * 8;
				b = (uint8_t)(dest >> (24 - j));
				if (!bcj_x86_test_msbyte(b))
					break;

				src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
			}

			dest &= 0x01FFFFFF;
			dest |= (uint32_t)0 - (dest & 0x01000000);
			put_unaligned_le32(dest, buf + i + 1);
			i += 4;
		} else {
			prev_mask = (prev_mask << 1) | 1;
		}
	}

	prev_pos = i - prev_pos;
	s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
	return i;
}
#endif

#ifdef XZ_DEC_POWERPC
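/*
 * Filter big endian PowerPC branch-and-link instructions: matching
 * 0x48000001 under the mask 0xFC000003 selects opcode 18 with AA=0 and
 * LK=1, i.e. a relative "bl". The 24-bit, word-aligned displacement was
 * made absolute by the encoder and is converted back to relative here.
 */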
static noinline_for_stack size_t XZ_FUNC bcj_powerpc(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	for (i = 0; i + 4 <= size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr & 0xFC000003) == 0x48000001) {
			instr &= 0x03FFFFFC;
			instr -= s->pos + (uint32_t)i;
			instr &= 0x03FFFFFC;
			instr |= 0x48000001;
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_IA64
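/*
 * Filter IA-64 branches. IA-64 code is organized as 128-bit bundles: a 5-bit
 * template followed by three 41-bit instruction slots. branch_table maps the
 * template to a bitmask of the slots that can hold a branch. For each such
 * slot, the 21-bit, 16-byte-aligned displacement of an IP-relative
 * branch/call is converted from the absolute form written by the encoder
 * back to relative.
 */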
static noinline_for_stack size_t XZ_FUNC bcj_ia64(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	static const uint8_t branch_table[32] = {
		0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0,
		4, 4, 6, 6, 0, 0, 7, 7,
		4, 4, 0, 0, 4, 4, 0, 0
	};

	/*
	 * The local variables take a little bit of stack space, but it's
	 * less than what the LZMA2 decoder takes, so it doesn't make sense
	 * to reduce stack usage here without doing that for the LZMA2
	 * decoder too.
	 */

	/* Loop counters */
	size_t i;
	size_t j;

	/* Instruction slot (0, 1, or 2) in the 128-bit instruction word */
	uint32_t slot;

	/* Bitwise offset of the instruction indicated by slot */
	uint32_t bit_pos;

	/* bit_pos split into byte and bit parts */
	uint32_t byte_pos;
	uint32_t bit_res;

	/* Address part of an instruction */
	uint32_t addr;

	/* Mask used to detect which instructions to convert */
	uint32_t mask;

	/* 41-bit instruction stored somewhere in the lowest 48 bits */
	uint64_t instr;

	/* Instruction normalized with bit_res for easier manipulation */
	uint64_t norm;

	for (i = 0; i + 16 <= size; i += 16) {
		mask = branch_table[buf[i] & 0x1F];
		for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) {
			if (((mask >> slot) & 1) == 0)
				continue;

			byte_pos = bit_pos >> 3;
			bit_res = bit_pos & 7;
			instr = 0;
			for (j = 0; j < 6; ++j)
				instr |= (uint64_t)(buf[i + j + byte_pos])
						<< (8 * j);

			norm = instr >> bit_res;

			if (((norm >> 37) & 0x0F) == 0x05
					&& ((norm >> 9) & 0x07) == 0) {
				addr = (norm >> 13) & 0x0FFFFF;
				addr |= ((uint32_t)(norm >> 36) & 1) << 20;
				addr <<= 4;
				addr -= s->pos + (uint32_t)i;
				addr >>= 4;

				norm &= ~((uint64_t)0x8FFFFF << 13);
				norm |= (uint64_t)(addr & 0x0FFFFF) << 13;
				norm |= (uint64_t)(addr & 0x100000)
						<< (36 - 20);

				instr &= (1 << bit_res) - 1;
				instr |= norm << bit_res;

				for (j = 0; j < 6; j++)
					buf[i + j + byte_pos]
						= (uint8_t)(instr >> (8 * j));
			}
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_ARM
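/*
 * Filter little endian ARM BL instructions (top byte 0xEB: condition AL,
 * branch with link). The 24-bit displacement counts 32-bit words and is
 * relative to the current instruction plus 8 (the value PC reads as), which
 * is why 8 is added to s->pos + i when converting back from absolute to
 * relative.
 */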
static noinline_for_stack size_t XZ_FUNC bcj_arm(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	for (i = 0; i + 4 <= size; i += 4) {
		if (buf[i + 3] == 0xEB) {
			addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8)
					| ((uint32_t)buf[i + 2] << 16);
			addr <<= 2;
			addr -= s->pos + (uint32_t)i + 8;
			addr >>= 2;
			buf[i] = (uint8_t)addr;
			buf[i + 1] = (uint8_t)(addr >> 8);
			buf[i + 2] = (uint8_t)(addr >> 16);
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_ARMTHUMB
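/*
 * Filter little endian ARM-Thumb BL instruction pairs. A Thumb BL is two
 * 16-bit halfwords: the first starts with the bit pattern 11110 (0xF0 after
 * masking with 0xF8) and holds the high bits of the offset, the second
 * starts with 11111 (0xF8) and holds the low bits. The offset counts
 * halfwords and is relative to the instruction address plus 4, hence the
 * +4 below.
 */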
static noinline_for_stack size_t XZ_FUNC bcj_armthumb(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t addr;

	for (i = 0; i + 4 <= size; i += 2) {
		if ((buf[i + 1] & 0xF8) == 0xF0
				&& (buf[i + 3] & 0xF8) == 0xF8) {
			addr = (((uint32_t)buf[i + 1] & 0x07) << 19)
					| ((uint32_t)buf[i] << 11)
					| (((uint32_t)buf[i + 3] & 0x07) << 8)
					| (uint32_t)buf[i + 2];
			addr <<= 1;
			addr -= s->pos + (uint32_t)i + 4;
			addr >>= 1;
			buf[i + 1] = (uint8_t)(0xF0 | ((addr >> 19) & 0x07));
			buf[i] = (uint8_t)(addr >> 11);
			buf[i + 3] = (uint8_t)(0xF8 | ((addr >> 8) & 0x07));
			buf[i + 2] = (uint8_t)addr;
			i += 2;
		}
	}

	return i;
}
#endif

#ifdef XZ_DEC_SPARC
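/*
 * Filter SPARC CALL instructions (op = 01 in the top two bits followed by a
 * 30-bit word displacement). Only calls whose displacement has its high bits
 * all zeros or all ones are touched, which is what the two (instr >> 22)
 * comparisons check. After converting the displacement back to relative, the
 * 0x40000000 arithmetic re-applies the sign extension into the high bits.
 */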
static noinline_for_stack size_t XZ_FUNC bcj_sparc(
		struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
	size_t i;
	uint32_t instr;

	for (i = 0; i + 4 <= size; i += 4) {
		instr = get_unaligned_be32(buf + i);
		if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) {
			instr <<= 2;
			instr -= s->pos + (uint32_t)i;
			instr >>= 2;
			instr = ((uint32_t)0x40000000 - (instr & 0x400000))
					| 0x40000000 | (instr & 0x3FFFFF);
			put_unaligned_be32(instr, buf + i);
		}
	}

	return i;
}
#endif

/*
 * Apply the selected BCJ filter. Update *pos and s->pos to match the amount
 * of data that got filtered.
 *
 * NOTE: This is implemented as a switch statement to avoid using function
 * pointers, which could be problematic in the kernel boot code, where
 * pointers to static data must be avoided (at least on x86).
 */
static void XZ_FUNC bcj_apply(struct xz_dec_bcj *s,
		uint8_t *buf, size_t *pos, size_t size)
{
	size_t filtered;

	buf += *pos;
	size -= *pos;

	switch (s->type) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
		filtered = bcj_x86(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
		filtered = bcj_powerpc(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
		filtered = bcj_ia64(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
		filtered = bcj_arm(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
		filtered = bcj_armthumb(s, buf, size);
		break;
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
		filtered = bcj_sparc(s, buf, size);
		break;
#endif
	default:
		/* Never reached but silence compiler warnings. */
		filtered = 0;
		break;
	}

	*pos += filtered;
	s->pos += filtered;
}

/*
 * Flush pending filtered data from temp to the output buffer.
 * Move the remaining mixture of possibly filtered and unfiltered
 * data to the beginning of temp.
 */
static void XZ_FUNC bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
{
	size_t copy_size;

	copy_size = min_t(size_t, s->temp.filtered, b->out_size - b->out_pos);
	memcpy(b->out + b->out_pos, s->temp.buf, copy_size);
	b->out_pos += copy_size;

	s->temp.filtered -= copy_size;
	s->temp.size -= copy_size;
	memmove(s->temp.buf, s->temp.buf + copy_size, s->temp.size);
}

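/*
 * A rough sketch of how the caller (the .xz stream decoder) is expected to
 * drive this filter chain; buffer setup and error handling are omitted and
 * the loop below is only illustrative:
 *
 *	s = xz_dec_bcj_create(single_call);
 *	ret = xz_dec_bcj_reset(s, filter_id);
 *	while (ret == XZ_OK) {
 *		refill b->in and/or make room in b->out, then:
 *		ret = xz_dec_bcj_run(s, lzma2, b);
 *	}
 */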
/*
 * The BCJ filter functions are primitive in the sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does some
 * buffering.
 */
XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_run(struct xz_dec_bcj *s,
		struct xz_dec_lzma2 *lzma2, struct xz_buf *b)
{
	size_t out_start;

	/*
	 * Flush pending already filtered data to the output buffer. Return
	 * immediately if we couldn't flush everything, or if the next
	 * filter in the chain had already returned XZ_STREAM_END.
	 */
	if (s->temp.filtered > 0) {
		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;

		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;
	}

	/*
	 * If we have more output space than what is currently pending in
	 * temp, copy the unfiltered data from temp to the output buffer
	 * and try to fill the output buffer by decoding more data from the
	 * next filter in the chain. Apply the BCJ filter on the new data
	 * in the output buffer. If not everything can be filtered, copy it
	 * to temp and rewind the output buffer position accordingly.
	 *
	 * This always needs to be run when temp.size == 0 to handle a
	 * special case where the output buffer is full and the next filter
	 * has no more output coming but hasn't returned XZ_STREAM_END yet.
	 */
	if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0) {
		out_start = b->out_pos;
		memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
		b->out_pos += s->temp.size;

		s->ret = xz_dec_lzma2_run(lzma2, b);
		if (s->ret != XZ_STREAM_END
				&& (s->ret != XZ_OK || s->single_call))
			return s->ret;

		bcj_apply(s, b->out, &out_start, b->out_pos);

		/*
		 * As an exception, if the next filter returned XZ_STREAM_END,
		 * we can return XZ_STREAM_END too, since the last few bytes
		 * that remain unfiltered are meant to remain unfiltered.
		 */
		if (s->ret == XZ_STREAM_END)
			return XZ_STREAM_END;

		s->temp.size = b->out_pos - out_start;
		b->out_pos -= s->temp.size;
		memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

		/*
		 * If there wasn't enough input to the next filter to fill
		 * the output buffer with unfiltered data, there's no point
		 * in trying to decode more data to temp.
		 */
		if (b->out_pos + s->temp.size < b->out_size)
			return XZ_OK;
	}

	/*
	 * We have unfiltered data in temp. If the output buffer isn't full
	 * yet, try to fill the temp buffer by decoding more data from the
	 * next filter. Apply the BCJ filter on temp. Then we hopefully can
	 * fill the actual output buffer by copying filtered data from temp.
	 * A mix of filtered and unfiltered data may be left in temp; it will
	 * be taken care of on the next call to this function.
	 */
	if (b->out_pos < b->out_size) {
		/* Make b->out{,_pos,_size} temporarily point to s->temp. */
		s->out = b->out;
		s->out_pos = b->out_pos;
		s->out_size = b->out_size;
		b->out = s->temp.buf;
		b->out_pos = s->temp.size;
		b->out_size = sizeof(s->temp.buf);

		s->ret = xz_dec_lzma2_run(lzma2, b);

		s->temp.size = b->out_pos;
		b->out = s->out;
		b->out_pos = s->out_pos;
		b->out_size = s->out_size;

		if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
			return s->ret;

		bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

		/*
		 * If the next filter returned XZ_STREAM_END, we mark that
		 * everything is filtered, since the last unfiltered bytes
		 * of the stream are meant to be left as is.
		 */
		if (s->ret == XZ_STREAM_END)
			s->temp.filtered = s->temp.size;

		bcj_flush(s, b);
		if (s->temp.filtered > 0)
			return XZ_OK;
	}

	return s->ret;
}

XZ_EXTERN struct xz_dec_bcj * XZ_FUNC xz_dec_bcj_create(bool single_call)
{
	struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s != NULL)
		s->single_call = single_call;

	return s;
}

XZ_EXTERN enum xz_ret XZ_FUNC xz_dec_bcj_reset(
		struct xz_dec_bcj *s, uint8_t id)
{
	switch (id) {
#ifdef XZ_DEC_X86
	case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
	case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
	case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
	case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
	case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
	case BCJ_SPARC:
#endif
		break;

	default:
		/* Unsupported Filter ID */
		return XZ_OPTIONS_ERROR;
	}

	s->type = id;
	s->ret = XZ_OK;
	s->pos = 0;
	s->x86_prev_mask = 0;
	s->temp.filtered = 0;
	s->temp.size = 0;

	return XZ_OK;
}

#endif