1 /*
2  * bzip2 is written by Julian Seward <jseward@bzip.org>.
3  * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4  * See README and LICENSE files in this directory for more information.
5  */
6 
7 /*-------------------------------------------------------------*/
8 /*--- Block sorting machinery                               ---*/
9 /*---                                           blocksort.c ---*/
10 /*-------------------------------------------------------------*/
11 
12 /* ------------------------------------------------------------------
13 This file is part of bzip2/libbzip2, a program and library for
14 lossless, block-sorting data compression.
15 
16 bzip2/libbzip2 version 1.0.4 of 20 December 2006
17 Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
18 
19 Please read the WARNING, DISCLAIMER and PATENTS sections in the
20 README file.
21 
22 This program is released under the terms of the license contained
23 in the file LICENSE.
24 ------------------------------------------------------------------ */
25 
26 /* #include "bzlib_private.h" */
27 
28 #define mswap(zz1, zz2) \
29 { \
30 	int32_t zztmp = zz1; \
31 	zz1 = zz2; \
32 	zz2 = zztmp; \
33 }
34 
35 static
36 /* No measurable speed gain with inlining */
37 /* ALWAYS_INLINE */
mvswap(uint32_t * ptr,int32_t zzp1,int32_t zzp2,int32_t zzn)38 void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
39 {
40 	while (zzn > 0) {
41 		mswap(ptr[zzp1], ptr[zzp2]);
42 		zzp1++;
43 		zzp2++;
44 		zzn--;
45 	}
46 }
47 
48 static
49 ALWAYS_INLINE
mmin(int32_t a,int32_t b)50 int32_t mmin(int32_t a, int32_t b)
51 {
52 	return (a < b) ? a : b;
53 }
54 
55 
56 /*---------------------------------------------*/
57 /*--- Fallback O(N log(N)^2) sorting        ---*/
58 /*--- algorithm, for repetitive blocks      ---*/
59 /*---------------------------------------------*/
60 
61 /*---------------------------------------------*/
62 static
63 inline
fallbackSimpleSort(uint32_t * fmap,uint32_t * eclass,int32_t lo,int32_t hi)64 void fallbackSimpleSort(uint32_t* fmap,
65 		uint32_t* eclass,
66 		int32_t   lo,
67 		int32_t   hi)
68 {
69 	int32_t i, j, tmp;
70 	uint32_t ec_tmp;
71 
72 	if (lo == hi) return;
73 
74 	if (hi - lo > 3) {
75 		for (i = hi-4; i >= lo; i--) {
76 			tmp = fmap[i];
77 			ec_tmp = eclass[tmp];
78 			for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
79 				fmap[j-4] = fmap[j];
80 			fmap[j-4] = tmp;
81 		}
82 	}
83 
84 	for (i = hi-1; i >= lo; i--) {
85 		tmp = fmap[i];
86 		ec_tmp = eclass[tmp];
87 		for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
88 			fmap[j-1] = fmap[j];
89 		fmap[j-1] = tmp;
90 	}
91 }
92 
93 
94 /*---------------------------------------------*/
95 #define fpush(lz,hz) { \
96 	stackLo[sp] = lz; \
97 	stackHi[sp] = hz; \
98 	sp++; \
99 }
100 
101 #define fpop(lz,hz) { \
102 	sp--; \
103 	lz = stackLo[sp]; \
104 	hz = stackHi[sp]; \
105 }
106 
107 #define FALLBACK_QSORT_SMALL_THRESH 10
108 #define FALLBACK_QSORT_STACK_SIZE   100
109 
110 static NOINLINE
fallbackQSort3(uint32_t * fmap,uint32_t * eclass,int32_t loSt,int32_t hiSt)111 void fallbackQSort3(uint32_t* fmap,
112 		uint32_t* eclass,
113 		int32_t   loSt,
114 		int32_t   hiSt)
115 {
116 	int32_t sp;
117 	uint32_t r;
118 	int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
119 	int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
120 
121 	r = 0;
122 
123 	sp = 0;
124 	fpush(loSt, hiSt);
125 
126 	while (sp > 0) {
127 		int32_t unLo, unHi, ltLo, gtHi, n, m;
128 		int32_t lo, hi;
129 		uint32_t med;
130 		uint32_t r3;
131 
132 		AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
133 
134 		fpop(lo, hi);
135 		if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
136 			fallbackSimpleSort(fmap, eclass, lo, hi);
137 			continue;
138 		}
139 
140 		/* Random partitioning.  Median of 3 sometimes fails to
141 		 * avoid bad cases.  Median of 9 seems to help but
142 		 * looks rather expensive.  This too seems to work but
143 		 * is cheaper.  Guidance for the magic constants
144 		 * 7621 and 32768 is taken from Sedgewick's algorithms
145 		 * book, chapter 35.
146 		 */
147 		r = ((r * 7621) + 1) % 32768;
148 		r3 = r % 3;
149 		if (r3 == 0)
150 			med = eclass[fmap[lo]];
151 		else if (r3 == 1)
152 			med = eclass[fmap[(lo+hi)>>1]];
153 		else
154 			med = eclass[fmap[hi]];
155 
156 		unLo = ltLo = lo;
157 		unHi = gtHi = hi;
158 
159 		while (1) {
160 			while (1) {
161 				if (unLo > unHi) break;
162 				n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
163 				if (n == 0) {
164 					mswap(fmap[unLo], fmap[ltLo]);
165 					ltLo++;
166 					unLo++;
167 					continue;
168 				}
169 				if (n > 0) break;
170 				unLo++;
171 			}
172 			while (1) {
173 				if (unLo > unHi) break;
174 				n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
175 				if (n == 0) {
176 					mswap(fmap[unHi], fmap[gtHi]);
177 					gtHi--; unHi--;
178 					continue;
179 				}
180 				if (n < 0) break;
181 				unHi--;
182 			}
183 			if (unLo > unHi) break;
184 			mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
185 		}
186 
187 		AssertD(unHi == unLo-1, "fallbackQSort3(2)");
188 
189 		if (gtHi < ltLo) continue;
190 
191 		n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
192 		m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
193 
194 		n = lo + unLo - ltLo - 1;
195 		m = hi - (gtHi - unHi) + 1;
196 
197 		if (n - lo > hi - m) {
198 			fpush(lo, n);
199 			fpush(m, hi);
200 		} else {
201 			fpush(m, hi);
202 			fpush(lo, n);
203 		}
204 	}
205 }
206 
207 #undef fpush
208 #undef fpop
209 #undef FALLBACK_QSORT_SMALL_THRESH
210 #undef FALLBACK_QSORT_STACK_SIZE
211 
212 
213 /*---------------------------------------------*/
214 /* Pre:
215  *	nblock > 0
216  *	eclass exists for [0 .. nblock-1]
217  *	((uint8_t*)eclass) [0 .. nblock-1] holds block
218  *	ptr exists for [0 .. nblock-1]
219  *
220  * Post:
221  *	((uint8_t*)eclass) [0 .. nblock-1] holds block
222  *	All other areas of eclass destroyed
223  *	fmap [0 .. nblock-1] holds sorted order
224  *	bhtab[0 .. 2+(nblock/32)] destroyed
225 */
226 
227 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
228 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
229 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
230 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
231 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
232 
233 static
fallbackSort(EState * state)234 void fallbackSort(EState* state)
235 {
236 	int32_t ftab[257];
237 	int32_t ftabCopy[256];
238 	int32_t H, i, j, k, l, r, cc, cc1;
239 	int32_t nNotDone;
240 	int32_t nBhtab;
241 	/* params */
242 	uint32_t *const fmap    = state->arr1;
243 	uint32_t *const eclass  = state->arr2;
244 #define eclass8 ((uint8_t*)eclass)
245 	uint32_t *const bhtab   = state->ftab;
246 	const int32_t   nblock  = state->nblock;
247 
248 	/*
249 	 * Initial 1-char radix sort to generate
250 	 * initial fmap and initial BH bits.
251 	 */
252 	for (i = 0; i < 257;    i++) ftab[i] = 0;
253 	for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
254 	for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
255 
256 	j = ftab[0];  /* bbox: optimized */
257 	for (i = 1; i < 257;    i++) {
258 		j += ftab[i];
259 		ftab[i] = j;
260 	}
261 
262 	for (i = 0; i < nblock; i++) {
263 		j = eclass8[i];
264 		k = ftab[j] - 1;
265 		ftab[j] = k;
266 		fmap[k] = i;
267 	}
268 
269 	nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
270 	for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
271 	for (i = 0; i < 256; i++) SET_BH(ftab[i]);
272 
273 	/*
274 	 * Inductively refine the buckets.  Kind-of an
275 	 * "exponential radix sort" (!), inspired by the
276 	 * Manber-Myers suffix array construction algorithm.
277 	 */
278 
279 	/*-- set sentinel bits for block-end detection --*/
280 	for (i = 0; i < 32; i++) {
281 		SET_BH(nblock + 2*i);
282 		CLEAR_BH(nblock + 2*i + 1);
283 	}
284 
285 	/*-- the log(N) loop --*/
286 	H = 1;
287 	while (1) {
288 		j = 0;
289 		for (i = 0; i < nblock; i++) {
290 			if (ISSET_BH(i))
291 				j = i;
292 			k = fmap[i] - H;
293 			if (k < 0)
294 				k += nblock;
295 			eclass[k] = j;
296 		}
297 
298 		nNotDone = 0;
299 		r = -1;
300 		while (1) {
301 
302 		/*-- find the next non-singleton bucket --*/
303 			k = r + 1;
304 			while (ISSET_BH(k) && UNALIGNED_BH(k))
305 				k++;
306 			if (ISSET_BH(k)) {
307 				while (WORD_BH(k) == 0xffffffff) k += 32;
308 				while (ISSET_BH(k)) k++;
309 			}
310 			l = k - 1;
311 			if (l >= nblock)
312 				break;
313 			while (!ISSET_BH(k) && UNALIGNED_BH(k))
314 				k++;
315 			if (!ISSET_BH(k)) {
316 				while (WORD_BH(k) == 0x00000000) k += 32;
317 				while (!ISSET_BH(k)) k++;
318 			}
319 			r = k - 1;
320 			if (r >= nblock)
321 				break;
322 
323 			/*-- now [l, r] bracket current bucket --*/
324 			if (r > l) {
325 				nNotDone += (r - l + 1);
326 				fallbackQSort3(fmap, eclass, l, r);
327 
328 				/*-- scan bucket and generate header bits-- */
329 				cc = -1;
330 				for (i = l; i <= r; i++) {
331 					cc1 = eclass[fmap[i]];
332 					if (cc != cc1) {
333 						SET_BH(i);
334 						cc = cc1;
335 					}
336 				}
337 			}
338 		}
339 
340 		H *= 2;
341 		if (H > nblock || nNotDone == 0)
342 			break;
343 	}
344 
345 	/*
346 	 * Reconstruct the original block in
347 	 * eclass8 [0 .. nblock-1], since the
348 	 * previous phase destroyed it.
349 	 */
350 	j = 0;
351 	for (i = 0; i < nblock; i++) {
352 		while (ftabCopy[j] == 0)
353 			j++;
354 		ftabCopy[j]--;
355 		eclass8[fmap[i]] = (uint8_t)j;
356 	}
357 	AssertH(j < 256, 1005);
358 #undef eclass8
359 }
360 
361 #undef       SET_BH
362 #undef     CLEAR_BH
363 #undef     ISSET_BH
364 #undef      WORD_BH
365 #undef UNALIGNED_BH
366 
367 
368 /*---------------------------------------------*/
369 /*--- The main, O(N^2 log(N)) sorting       ---*/
370 /*--- algorithm.  Faster for "normal"       ---*/
371 /*--- non-repetitive blocks.                ---*/
372 /*---------------------------------------------*/
373 
374 /*---------------------------------------------*/
375 static
376 NOINLINE
mainGtU(EState * state,uint32_t i1,uint32_t i2)377 int mainGtU(EState* state,
378 		uint32_t  i1,
379 		uint32_t  i2)
380 {
381 	int32_t  k;
382 	uint8_t  c1, c2;
383 	uint16_t s1, s2;
384 
385 	uint8_t  *const block    = state->block;
386 	uint16_t *const quadrant = state->quadrant;
387 	const int32_t   nblock   = state->nblock;
388 
389 /* Loop unrolling here is actually very useful
390  * (generated code is much simpler),
391  * code size increase is only 270 bytes (i386)
392  * but speeds up compression 10% overall
393  */
394 
395 #if BZIP2_SPEED >= 1
396 
397 #define TIMES_8(code) \
398 	code; code; code; code; \
399 	code; code; code; code;
400 #define TIMES_12(code) \
401 	code; code; code; code; \
402 	code; code; code; code; \
403 	code; code; code; code;
404 
405 #else
406 
407 #define TIMES_8(code) \
408 { \
409 	int nn = 8; \
410 	do { \
411 		code; \
412 	} while (--nn); \
413 }
414 #define TIMES_12(code) \
415 { \
416 	int nn = 12; \
417 	do { \
418 		code; \
419 	} while (--nn); \
420 }
421 
422 #endif
423 
424 	AssertD(i1 != i2, "mainGtU");
425 	TIMES_12(
426 		c1 = block[i1]; c2 = block[i2];
427 		if (c1 != c2) return (c1 > c2);
428 		i1++; i2++;
429 	)
430 
431 	k = nblock + 8;
432 
433 	do {
434 		TIMES_8(
435 			c1 = block[i1]; c2 = block[i2];
436 			if (c1 != c2) return (c1 > c2);
437 			s1 = quadrant[i1]; s2 = quadrant[i2];
438 			if (s1 != s2) return (s1 > s2);
439 			i1++; i2++;
440 		)
441 
442 		if (i1 >= nblock) i1 -= nblock;
443 		if (i2 >= nblock) i2 -= nblock;
444 
445 		state->budget--;
446 		k -= 8;
447 	} while (k >= 0);
448 
449 	return False;
450 }
451 #undef TIMES_8
452 #undef TIMES_12
453 
454 /*---------------------------------------------*/
455 /*
456  * Knuth's increments seem to work better
457  * than Incerpi-Sedgewick here.  Possibly
458  * because the number of elems to sort is
459  * usually small, typically <= 20.
460  */
461 static
462 const uint32_t incs[14] ALIGN4 = {
463 	1, 4, 13, 40, 121, 364, 1093, 3280,
464 	9841, 29524, 88573, 265720,
465 	797161, 2391484
466 };
467 
468 static
mainSimpleSort(EState * state,int32_t lo,int32_t hi,int32_t d)469 void mainSimpleSort(EState* state,
470 		int32_t   lo,
471 		int32_t   hi,
472 		int32_t   d)
473 {
474 	uint32_t *const ptr = state->ptr;
475 
476 	/* At which increment to start? */
477 	int hp = 0;
478 	{
479 		int bigN = hi - lo;
480 		if (bigN <= 0)
481 			return;
482 		while (incs[hp] <= bigN)
483 			hp++;
484 		hp--;
485 	}
486 
487 	for (; hp >= 0; hp--) {
488 		int32_t i;
489 		unsigned h;
490 
491 		h = incs[hp];
492 		i = lo + h;
493 		while (1) {
494 			unsigned j;
495 			unsigned v;
496 
497 			if (i > hi) break;
498 			v = ptr[i];
499 			j = i;
500 			while (mainGtU(state, ptr[j-h]+d, v+d)) {
501 				ptr[j] = ptr[j-h];
502 				j = j - h;
503 				if (j <= (lo + h - 1)) break;
504 			}
505 			ptr[j] = v;
506 			i++;
507 
508 /* 1.5% overall speedup, +290 bytes */
509 #if BZIP2_SPEED >= 3
510 			/*-- copy 2 --*/
511 			if (i > hi) break;
512 			v = ptr[i];
513 			j = i;
514 			while (mainGtU(state, ptr[j-h]+d, v+d)) {
515 				ptr[j] = ptr[j-h];
516 				j = j - h;
517 				if (j <= (lo + h - 1)) break;
518 			}
519 			ptr[j] = v;
520 			i++;
521 			/*-- copy 3 --*/
522 			if (i > hi) break;
523 			v = ptr[i];
524 			j = i;
525 			while (mainGtU(state, ptr[j-h]+d, v+d)) {
526 				ptr[j] = ptr[j-h];
527 				j = j - h;
528 				if (j <= (lo + h - 1)) break;
529 			}
530 			ptr[j] = v;
531 			i++;
532 #endif
533 			if (state->budget < 0) return;
534 		}
535 	}
536 }
537 
538 
539 /*---------------------------------------------*/
540 /*
541  * The following is an implementation of
542  * an elegant 3-way quicksort for strings,
543  * described in a paper "Fast Algorithms for
544  * Sorting and Searching Strings", by Robert
545  * Sedgewick and Jon L. Bentley.
546  */
547 
548 static
549 ALWAYS_INLINE
mmed3(uint8_t a,uint8_t b,uint8_t c)550 uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
551 {
552 	uint8_t t;
553 	if (a > b) {
554 		t = a;
555 		a = b;
556 		b = t;
557 	}
558 	/* here b >= a */
559 	if (b > c) {
560 		b = c;
561 		if (a > b)
562 			b = a;
563 	}
564 	return b;
565 }
566 
567 #define mpush(lz,hz,dz) \
568 { \
569 	stackLo[sp] = lz; \
570 	stackHi[sp] = hz; \
571 	stackD [sp] = dz; \
572 	sp++; \
573 }
574 
575 #define mpop(lz,hz,dz) \
576 { \
577 	sp--; \
578 	lz = stackLo[sp]; \
579 	hz = stackHi[sp]; \
580 	dz = stackD [sp]; \
581 }
582 
583 #define mnextsize(az) (nextHi[az] - nextLo[az])
584 
585 #define mnextswap(az,bz) \
586 { \
587 	int32_t tz; \
588 	tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
589 	tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
590 	tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
591 }
592 
593 #define MAIN_QSORT_SMALL_THRESH 20
594 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
595 #define MAIN_QSORT_STACK_SIZE   100
596 
597 static NOINLINE
mainQSort3(EState * state,int32_t loSt,int32_t hiSt)598 void mainQSort3(EState* state,
599 		int32_t   loSt,
600 		int32_t   hiSt
601 		/*int32_t dSt*/)
602 {
603 	enum { dSt = BZ_N_RADIX };
604 	int32_t unLo, unHi, ltLo, gtHi, n, m, med;
605 	int32_t sp, lo, hi, d;
606 
607 	int32_t stackLo[MAIN_QSORT_STACK_SIZE];
608 	int32_t stackHi[MAIN_QSORT_STACK_SIZE];
609 	int32_t stackD [MAIN_QSORT_STACK_SIZE];
610 
611 	int32_t nextLo[3];
612 	int32_t nextHi[3];
613 	int32_t nextD [3];
614 
615 	uint32_t *const ptr   = state->ptr;
616 	uint8_t  *const block = state->block;
617 
618 	sp = 0;
619 	mpush(loSt, hiSt, dSt);
620 
621 	while (sp > 0) {
622 		AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
623 
624 		mpop(lo, hi, d);
625 		if (hi - lo < MAIN_QSORT_SMALL_THRESH
626 		 || d > MAIN_QSORT_DEPTH_THRESH
627 		) {
628 			mainSimpleSort(state, lo, hi, d);
629 			if (state->budget < 0)
630 				return;
631 			continue;
632 		}
633 		med = (int32_t)	mmed3(block[ptr[lo          ] + d],
634 		                      block[ptr[hi          ] + d],
635 		                      block[ptr[(lo+hi) >> 1] + d]);
636 
637 		unLo = ltLo = lo;
638 		unHi = gtHi = hi;
639 
640 		while (1) {
641 			while (1) {
642 				if (unLo > unHi)
643 					break;
644 				n = ((int32_t)block[ptr[unLo]+d]) - med;
645 				if (n == 0) {
646 					mswap(ptr[unLo], ptr[ltLo]);
647 					ltLo++;
648 					unLo++;
649 					continue;
650 				}
651 				if (n > 0) break;
652 				unLo++;
653 			}
654 			while (1) {
655 				if (unLo > unHi)
656 					break;
657 				n = ((int32_t)block[ptr[unHi]+d]) - med;
658 				if (n == 0) {
659 					mswap(ptr[unHi], ptr[gtHi]);
660 					gtHi--;
661 					unHi--;
662 					continue;
663 				}
664 				if (n < 0) break;
665 				unHi--;
666 			}
667 			if (unLo > unHi)
668 				break;
669 			mswap(ptr[unLo], ptr[unHi]);
670 			unLo++;
671 			unHi--;
672 		}
673 
674 		AssertD(unHi == unLo-1, "mainQSort3(2)");
675 
676 		if (gtHi < ltLo) {
677 			mpush(lo, hi, d + 1);
678 			continue;
679 		}
680 
681 		n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
682 		m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
683 
684 		n = lo + unLo - ltLo - 1;
685 		m = hi - (gtHi - unHi) + 1;
686 
687 		nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
688 		nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
689 		nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
690 
691 		if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
692 		if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
693 		if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
694 
695 		AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
696 		AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
697 
698 		mpush(nextLo[0], nextHi[0], nextD[0]);
699 		mpush(nextLo[1], nextHi[1], nextD[1]);
700 		mpush(nextLo[2], nextHi[2], nextD[2]);
701 	}
702 }
703 
704 #undef mpush
705 #undef mpop
706 #undef mnextsize
707 #undef mnextswap
708 #undef MAIN_QSORT_SMALL_THRESH
709 #undef MAIN_QSORT_DEPTH_THRESH
710 #undef MAIN_QSORT_STACK_SIZE
711 
712 
713 /*---------------------------------------------*/
714 /* Pre:
715  *	nblock > N_OVERSHOOT
716  *	block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
717  *	((uint8_t*)block32) [0 .. nblock-1] holds block
718  *	ptr exists for [0 .. nblock-1]
719  *
720  * Post:
721  *	((uint8_t*)block32) [0 .. nblock-1] holds block
722  *	All other areas of block32 destroyed
723  *	ftab[0 .. 65536] destroyed
724  *	ptr [0 .. nblock-1] holds sorted order
725  *	if (*budget < 0), sorting was abandoned
726  */
727 
728 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
729 #define SETMASK (1 << 21)
730 #define CLEARMASK (~(SETMASK))
731 
732 static NOINLINE
mainSort(EState * state)733 void mainSort(EState* state)
734 {
735 	int32_t  i, j;
736 	Bool     bigDone[256];
737 	uint8_t  runningOrder[256];
738 	/* bbox: moved to EState to save stack
739 	int32_t  copyStart[256];
740 	int32_t  copyEnd  [256];
741 	*/
742 #define copyStart    (state->mainSort__copyStart)
743 #define copyEnd      (state->mainSort__copyEnd)
744 
745 	uint32_t *const ptr      = state->ptr;
746 	uint8_t  *const block    = state->block;
747 	uint32_t *const ftab     = state->ftab;
748 	const int32_t   nblock   = state->nblock;
749 	uint16_t *const quadrant = state->quadrant;
750 
751 	/*-- set up the 2-byte frequency table --*/
752 	/* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
753 	memset(ftab, 0, 65537 * sizeof(ftab[0]));
754 
755 	j = block[0] << 8;
756 	i = nblock - 1;
757 /* 3%, +300 bytes */
758 #if BZIP2_SPEED >= 2
759 	for (; i >= 3; i -= 4) {
760 		quadrant[i] = 0;
761 		j = (j >> 8) | (((unsigned)block[i]) << 8);
762 		ftab[j]++;
763 		quadrant[i-1] = 0;
764 		j = (j >> 8) | (((unsigned)block[i-1]) << 8);
765 		ftab[j]++;
766 		quadrant[i-2] = 0;
767 		j = (j >> 8) | (((unsigned)block[i-2]) << 8);
768 		ftab[j]++;
769 		quadrant[i-3] = 0;
770 		j = (j >> 8) | (((unsigned)block[i-3]) << 8);
771 		ftab[j]++;
772 	}
773 #endif
774 	for (; i >= 0; i--) {
775 		quadrant[i] = 0;
776 		j = (j >> 8) | (((unsigned)block[i]) << 8);
777 		ftab[j]++;
778 	}
779 
780 	/*-- (emphasises close relationship of block & quadrant) --*/
781 	for (i = 0; i < BZ_N_OVERSHOOT; i++) {
782 		block   [nblock+i] = block[i];
783 		quadrant[nblock+i] = 0;
784 	}
785 
786 	/*-- Complete the initial radix sort --*/
787 	j = ftab[0]; /* bbox: optimized */
788 	for (i = 1; i <= 65536; i++) {
789 		j += ftab[i];
790 		ftab[i] = j;
791 	}
792 
793 	{
794 		unsigned s;
795 		s = block[0] << 8;
796 		i = nblock - 1;
797 #if BZIP2_SPEED >= 2
798 		for (; i >= 3; i -= 4) {
799 			s = (s >> 8) | (block[i] << 8);
800 			j = ftab[s] - 1;
801 			ftab[s] = j;
802 			ptr[j] = i;
803 			s = (s >> 8) | (block[i-1] << 8);
804 			j = ftab[s] - 1;
805 			ftab[s] = j;
806 			ptr[j] = i-1;
807 			s = (s >> 8) | (block[i-2] << 8);
808 			j = ftab[s] - 1;
809 			ftab[s] = j;
810 			ptr[j] = i-2;
811 			s = (s >> 8) | (block[i-3] << 8);
812 			j = ftab[s] - 1;
813 			ftab[s] = j;
814 			ptr[j] = i-3;
815 		}
816 #endif
817 		for (; i >= 0; i--) {
818 			s = (s >> 8) | (block[i] << 8);
819 			j = ftab[s] - 1;
820 			ftab[s] = j;
821 			ptr[j] = i;
822 		}
823 	}
824 
825 	/*
826 	 * Now ftab contains the first loc of every small bucket.
827 	 * Calculate the running order, from smallest to largest
828 	 * big bucket.
829 	 */
830 	for (i = 0; i <= 255; i++) {
831 		bigDone     [i] = False;
832 		runningOrder[i] = i;
833 	}
834 
835 	{
836 		/* bbox: was: int32_t h = 1; */
837 		/* do h = 3 * h + 1; while (h <= 256); */
838 		unsigned h = 364;
839 
840 		do {
841 			/*h = h / 3;*/
842 			h = (h * 171) >> 9; /* bbox: fast h/3 */
843 			for (i = h; i <= 255; i++) {
844 				unsigned vv, jh;
845 				vv = runningOrder[i]; /* uint8[] */
846 				j = i;
847 				while (jh = j - h, BIGFREQ(runningOrder[jh]) > BIGFREQ(vv)) {
848 					runningOrder[j] = runningOrder[jh];
849 					j = jh;
850 					if (j < h)
851 						break;
852 				}
853 				runningOrder[j] = vv;
854 			}
855 		} while (h != 1);
856 	}
857 
858 	/*
859 	 * The main sorting loop.
860 	 */
861 
862 	for (i = 0; /*i <= 255*/; i++) {
863 		unsigned ss;
864 
865 		/*
866 		 * Process big buckets, starting with the least full.
867 		 * Basically this is a 3-step process in which we call
868 		 * mainQSort3 to sort the small buckets [ss, j], but
869 		 * also make a big effort to avoid the calls if we can.
870 		 */
871 		ss = runningOrder[i];
872 
873 		/*
874 		 * Step 1:
875 		 * Complete the big bucket [ss] by quicksorting
876 		 * any unsorted small buckets [ss, j], for j != ss.
877 		 * Hopefully previous pointer-scanning phases have already
878 		 * completed many of the small buckets [ss, j], so
879 		 * we don't have to sort them at all.
880 		 */
881 		for (j = 0; j <= 255; j++) {
882 			if (j != ss) {
883 				unsigned sb;
884 				sb = (ss << 8) + j;
885 				if (!(ftab[sb] & SETMASK)) {
886 					int32_t lo =  ftab[sb] /*& CLEARMASK (redundant)*/;
887 					int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
888 					if (hi > lo) {
889 						mainQSort3(state, lo, hi /*,BZ_N_RADIX*/);
890 						if (state->budget < 0) return;
891 					}
892 				}
893 				ftab[sb] |= SETMASK;
894 			}
895 		}
896 
897 		AssertH(!bigDone[ss], 1006);
898 
899 		/*
900 		 * Step 2:
901 		 * Now scan this big bucket [ss] so as to synthesise the
902 		 * sorted order for small buckets [t, ss] for all t,
903 		 * including, magically, the bucket [ss,ss] too.
904 		 * This will avoid doing Real Work in subsequent Step 1's.
905 		 */
906 		{
907 			for (j = 0; j <= 255; j++) {
908 				copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
909 				copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
910 			}
911 			for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
912 				unsigned c1;
913 				int32_t k;
914 				k = ptr[j] - 1;
915 				if (k < 0)
916 					k += nblock;
917 				c1 = block[k];
918 				if (!bigDone[c1])
919 					ptr[copyStart[c1]++] = k;
920 			}
921 			for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
922 				unsigned c1;
923 				int32_t k;
924 				k = ptr[j]-1;
925 				if (k < 0)
926 					k += nblock;
927 				c1 = block[k];
928 				if (!bigDone[c1])
929 					ptr[copyEnd[c1]--] = k;
930 			}
931 		}
932 
933 		/* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
934 		 * Necessity for this case is demonstrated by compressing
935 		 * a sequence of approximately 48.5 million of character
936 		 * 251; 1.0.0/1.0.1 will then die here. */
937 		AssertH((copyStart[ss]-1 == copyEnd[ss]) \
938 		     || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
939 
940 		for (j = 0; j <= 255; j++)
941 			ftab[(j << 8) + ss] |= SETMASK;
942 
943 		if (i == 255)
944 			break;
945 
946 		/*
947 		 * Step 3:
948 		 * The [ss] big bucket is now done.  Record this fact,
949 		 * and update the quadrant descriptors.  Remember to
950 		 * update quadrants in the overshoot area too, if
951 		 * necessary.  The "if (i < 255)" test merely skips
952 		 * this updating for the last bucket processed, since
953 		 * updating for the last bucket is pointless.
954 		 *
955 		 * The quadrant array provides a way to incrementally
956 		 * cache sort orderings, as they appear, so as to
957 		 * make subsequent comparisons in fullGtU() complete
958 		 * faster.  For repetitive blocks this makes a big
959 		 * difference (but not big enough to be able to avoid
960 		 * the fallback sorting mechanism, exponential radix sort).
961 		 *
962 		 * The precise meaning is: at all times:
963 		 *
964 		 *	for 0 <= i < nblock and 0 <= j <= nblock
965 		 *
966 		 *	if block[i] != block[j],
967 		 *
968 		 *		then the relative values of quadrant[i] and
969 		 *			  quadrant[j] are meaningless.
970 		 *
971 		 *		else {
972 		 *			if quadrant[i] < quadrant[j]
973 		 *				then the string starting at i lexicographically
974 		 *				precedes the string starting at j
975 		 *
976 		 *			else if quadrant[i] > quadrant[j]
977 		 *				then the string starting at j lexicographically
978 		 *				precedes the string starting at i
979 		 *
980 		 *			else
981 		 *				the relative ordering of the strings starting
982 		 *				at i and j has not yet been determined.
983 		 *		}
984 		 */
985 		bigDone[ss] = True;
986 
987 		{
988 			unsigned bbStart = ftab[ss << 8] & CLEARMASK;
989 			unsigned bbSize  = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
990 			unsigned shifts  = 0;
991 
992 			while ((bbSize >> shifts) > 65534) shifts++;
993 
994 			for (j = bbSize-1; j >= 0; j--) {
995 				unsigned a2update  = ptr[bbStart + j]; /* uint32[] */
996 				uint16_t qVal      = (uint16_t)(j >> shifts);
997 				quadrant[a2update] = qVal;
998 				if (a2update < BZ_N_OVERSHOOT)
999 					quadrant[a2update + nblock] = qVal;
1000 			}
1001 			AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
1002 		}
1003 	}
1004 #undef runningOrder
1005 #undef copyStart
1006 #undef copyEnd
1007 }
1008 
1009 #undef BIGFREQ
1010 #undef SETMASK
1011 #undef CLEARMASK
1012 
1013 
1014 /*---------------------------------------------*/
1015 /* Pre:
1016  *	nblock > 0
1017  *	arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1018  *	  ((uint8_t*)arr2)[0 .. nblock-1] holds block
1019  *	arr1 exists for [0 .. nblock-1]
1020  *
1021  * Post:
1022  *	((uint8_t*)arr2) [0 .. nblock-1] holds block
1023  *	All other areas of block destroyed
1024  *	ftab[0 .. 65536] destroyed
1025  *	arr1[0 .. nblock-1] holds sorted order
1026  */
1027 static NOINLINE
BZ2_blockSort(EState * state)1028 int32_t BZ2_blockSort(EState* state)
1029 {
1030 	/* In original bzip2 1.0.4, it's a parameter, but 30
1031 	 * (which was the default) should work ok. */
1032 	enum { wfact = 30 };
1033 	unsigned i;
1034 	int32_t origPtr = origPtr;
1035 
1036 	if (state->nblock >= 10000) {
1037 		/* Calculate the location for quadrant, remembering to get
1038 		 * the alignment right.  Assumes that &(block[0]) is at least
1039 		 * 2-byte aligned -- this should be ok since block is really
1040 		 * the first section of arr2.
1041 		 */
1042 		i = state->nblock + BZ_N_OVERSHOOT;
1043 		if (i & 1)
1044 			i++;
1045 		state->quadrant = (uint16_t*) &(state->block[i]);
1046 
1047 		/* (wfact-1) / 3 puts the default-factor-30
1048 		 * transition point at very roughly the same place as
1049 		 * with v0.1 and v0.9.0.
1050 		 * Not that it particularly matters any more, since the
1051 		 * resulting compressed stream is now the same regardless
1052 		 * of whether or not we use the main sort or fallback sort.
1053 		 */
1054 		state->budget = state->nblock * ((wfact-1) / 3);
1055 		mainSort(state);
1056 		if (state->budget >= 0)
1057 			goto good;
1058 	}
1059 	fallbackSort(state);
1060  good:
1061 
1062 #if BZ_LIGHT_DEBUG
1063 	origPtr = -1;
1064 #endif
1065 	for (i = 0; i < state->nblock; i++) {
1066 		if (state->ptr[i] == 0) {
1067 			origPtr = i;
1068 			break;
1069 		}
1070 	}
1071 
1072 	AssertH(origPtr != -1, 1003);
1073 	return origPtr;
1074 }
1075 
1076 
1077 /*-------------------------------------------------------------*/
1078 /*--- end                                       blocksort.c ---*/
1079 /*-------------------------------------------------------------*/
1080