1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 /* P9 gzip sample code for demonstrating the P9 NX hardware interface.
 * Not intended for production use or for performance or compression
 * ratio measurements.  For simplicity of demonstration, this sample
 * code compresses into fixed Huffman blocks only (Deflate btype=1)
7  * and has very simple memory management.  Dynamic Huffman blocks
8  * (Deflate btype=2) are more involved as detailed in the user guide.
9  * Note also that /dev/crypto/gzip, VAS and skiboot support are
10  * required.
11  *
12  * Copyright 2020 IBM Corp.
13  *
14  * https://github.com/libnxz/power-gzip for zlib api and other utils
15  *
16  * Author: Bulent Abali <abali@us.ibm.com>
17  *
18  * Definitions of acronyms used here. See
19  * P9 NX Gzip Accelerator User's Manual for details:
20  * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
21  *
22  * adler/crc: 32 bit checksums appended to stream tail
23  * ce:       completion extension
24  * cpb:      coprocessor parameter block (metadata)
25  * crb:      coprocessor request block (command)
26  * csb:      coprocessor status block (status)
27  * dht:      dynamic huffman table
28  * dde:      data descriptor element (address, length)
29  * ddl:      list of ddes
30  * dh/fh:    dynamic and fixed huffman types
31  * fc:       coprocessor function code
32  * histlen:  history/dictionary length
33  * history:  sliding window of up to 32KB of data
34  * lzcount:  Deflate LZ symbol counts
35  * rembytecnt: remaining byte count
36  * sfbt:     source final block type; last block's type during decomp
37  * spbc:     source processed byte count
38  * subc:     source unprocessed bit count
39  * tebc:     target ending bit count; valid bits in the last byte
40  * tpbc:     target processed byte count
41  * vas:      virtual accelerator switch; the user mode interface
42  */
43 
44 #define _ISOC11_SOURCE	// For aligned_alloc()
45 #define _DEFAULT_SOURCE	// For endian.h
46 
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <stdint.h>
52 #include <sys/types.h>
53 #include <sys/stat.h>
54 #include <sys/time.h>
55 #include <sys/fcntl.h>
56 #include <sys/mman.h>
57 #include <endian.h>
58 #include <bits/endian.h>
59 #include <sys/ioctl.h>
60 #include <assert.h>
61 #include <errno.h>
62 #include <signal.h>
63 #include "utils.h"
64 #include "nxu.h"
65 #include "nx.h"
66 
/* Debug level; main() resets it to 0 before doing any work. */
int nx_dbg;
/* Log stream; main() resets it to NULL before doing any work. */
FILE *nx_gzip_log;

/* Smaller of two values.  Note: each argument is evaluated twice. */
#define NX_MIN(a, b) (((a) < (b)) ? (a) : (b))
/* Size of the buffer that holds the output file name. */
#define FNAME_MAX 1024
/* Suffix appended to the input file name to form the output name. */
#define FEXT ".nx.gz"

/* sysfs entry read by compress_file() to pick the per-job chunk size. */
#define SYSFS_MAX_REQ_BUF_PATH "devices/vio/ibm,compression-v1/nx_gzip_caps/req_max_processed_len"
75 
76 /*
77  * LZ counts returned in the user supplied nx_gzip_crb_cpb_t structure.
78  */
compress_fht_sample(char * src,uint32_t srclen,char * dst,uint32_t dstlen,int with_count,struct nx_gzip_crb_cpb_t * cmdp,void * handle)79 static int compress_fht_sample(char *src, uint32_t srclen, char *dst,
80 				uint32_t dstlen, int with_count,
81 				struct nx_gzip_crb_cpb_t *cmdp, void *handle)
82 {
83 	uint32_t fc;
84 
85 	assert(!!cmdp);
86 
87 	put32(cmdp->crb, gzip_fc, 0);  /* clear */
88 	fc = (with_count) ? GZIP_FC_COMPRESS_RESUME_FHT_COUNT :
89 			    GZIP_FC_COMPRESS_RESUME_FHT;
90 	putnn(cmdp->crb, gzip_fc, fc);
91 	putnn(cmdp->cpb, in_histlen, 0); /* resuming with no history */
92 	memset((void *) &cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
93 
94 	/* Section 6.6 programming notes; spbc may be in two different
95 	 * places depending on FC.
96 	 */
97 	if (!with_count)
98 		put32(cmdp->cpb, out_spbc_comp, 0);
99 	else
100 		put32(cmdp->cpb, out_spbc_comp_with_count, 0);
101 
102 	/* Figure 6-3 6-4; CSB location */
103 	put64(cmdp->crb, csb_address, 0);
104 	put64(cmdp->crb, csb_address,
105 	      (uint64_t) &cmdp->crb.csb & csb_address_mask);
106 
107 	/* Source direct dde (scatter-gather list) */
108 	clear_dde(cmdp->crb.source_dde);
109 	putnn(cmdp->crb.source_dde, dde_count, 0);
110 	put32(cmdp->crb.source_dde, ddebc, srclen);
111 	put64(cmdp->crb.source_dde, ddead, (uint64_t) src);
112 
113 	/* Target direct dde (scatter-gather list) */
114 	clear_dde(cmdp->crb.target_dde);
115 	putnn(cmdp->crb.target_dde, dde_count, 0);
116 	put32(cmdp->crb.target_dde, ddebc, dstlen);
117 	put64(cmdp->crb.target_dde, ddead, (uint64_t) dst);
118 
119 	/* Submit the crb, the job descriptor, to the accelerator */
120 	return nxu_submit_job(cmdp, handle);
121 }
122 
/*
 * Writes a minimal gzip member header (no file name, no timestamp,
 * no optional fields) at buf and returns the number of bytes written.
 * buf must have room for the 10 header bytes.
 * Gzip specification at https://tools.ietf.org/html/rfc1952
 */
int gzip_header_blank(char *buf)
{
	static const unsigned char hdr[] = {
		0x1f, 0x8b,		/* ID1, ID2 */
		0x08,			/* CM  */
		0x00,			/* FLG */
		0x00, 0x00, 0x00, 0x00,	/* MTIME */
		0x04,			/* XFL 4=fastest */
		0x03,			/* OS UNIX */
	};
	int n;

	for (n = 0; n < (int) sizeof(hdr); n++)
		buf[n] = (char) hdr[n];

	return n;
}
145 
/*
 * Reads the whole file fname into a freshly allocated buffer.
 *
 * On success returns 0, stores the buffer in *buf and the number of
 * bytes read in *bufsize; the caller must free(*buf).  An empty file
 * yields a valid (freeable) buffer and *bufsize == 0.
 *
 * On any error prints a diagnostic to stderr, releases everything it
 * acquired, leaves *buf and *bufsize untouched and returns -1.
 */
int read_alloc_input_file(char *fname, char **buf, size_t *bufsize)
{
	struct stat statbuf;
	FILE *fp;
	char *p;
	size_t want;
	size_t num_bytes;

	if (stat(fname, &statbuf)) {
		perror(fname);
		return -1;
	}
	if (statbuf.st_size < 0) {
		fprintf(stderr, "%s: invalid file size\n", fname);
		return -1;
	}
	want = (size_t) statbuf.st_size;

	/* Binary mode: the payload is arbitrary data, not text */
	fp = fopen(fname, "rb");
	if (fp == NULL) {
		perror(fname);
		return -1;
	}

	/* malloc(0) is allowed to return NULL; ask for at least one
	 * byte so that an empty file is not mistaken for an
	 * allocation failure.
	 */
	p = malloc(want ? want : 1);
	if (p == NULL) {
		perror("malloc");
		fclose(fp);
		return -1;
	}

	num_bytes = fread(p, 1, want, fp);
	if (ferror(fp)) {
		perror(fname);
		free(p);
		fclose(fp);
		return -1;
	}
	if (num_bytes != want) {
		/* errno is not meaningful for a short read */
		fprintf(stderr, "%s: short read\n", fname);
		free(p);
		fclose(fp);
		return -1;
	}

	fclose(fp);
	*buf = p;
	*bufsize = num_bytes;
	return 0;
}
173 
/*
 * Writes bufsize bytes from buf to the file fname, creating or
 * truncating it.
 *
 * Returns 0 on success.  Returns -1 (after printing a diagnostic to
 * stderr) if the file cannot be opened, if not all bytes could be
 * written, or if closing the stream fails.  The stream is closed on
 * every path.
 */
int write_output_file(char *fname, char *buf, size_t bufsize)
{
	FILE *fp;
	size_t num_bytes;
	int rc = 0;

	/* Binary mode: the payload is a compressed stream, not text */
	fp = fopen(fname, "wb");
	if (fp == NULL) {
		perror(fname);
		return -1;
	}

	num_bytes = fwrite(buf, 1, bufsize, fp);
	if (ferror(fp) || (num_bytes != bufsize)) {
		perror(fname);
		rc = -1;
	}

	/* fclose() flushes buffered data, so a failure here means the
	 * output may be incomplete even though fwrite() succeeded.
	 */
	if (fclose(fp) != 0 && rc == 0) {
		perror(fname);
		rc = -1;
	}

	return rc;
}
193 
/*
 * Z_SYNC_FLUSH as described in zlib.h: appends an empty stored
 * Deflate block (BTYPE=00, LEN=0x0000, NLEN=0xFFFF) so that the
 * stream ends on a byte boundary.
 *
 * buf:   first byte past the compressed data written so far
 * tebc:  number of valid bits in the last written byte; when nonzero
 *        the block header is merged into that byte (buf[-1])
 * final: low bit becomes the BFINAL bit of the appended block
 *
 * Returns the number of bytes appended at and after buf.
 */
int append_sync_flush(char *buf, int tebc, int final)
{
	int bitpos = (tebc & 0x7);
	uint64_t word;
	int nbits;

	if (tebc > 0) {
		/* Last byte is partially full; keep only its valid bits */
		buf--;
		*buf = *buf & (unsigned char) ((1<<tebc)-1);
	} else {
		*buf = 0;
	}

	/* BFINAL goes right after the valid bits; BTYPE=00 follows */
	word = ((0x1ULL & final) << bitpos) | *buf;

	/* The 3 header bits either fit in the current byte or spill
	 * into the next one; LEN/NLEN start on that byte boundary.
	 */
	nbits = (bitpos + 3 <= 8) ? 8 : 16;
	word |= (0xFFFF0000ULL) << nbits; /* Zero length block */

	/* Emit header byte(s) plus the 32 bits of LEN/NLEN, LSB first */
	for (nbits += 32; nbits > 0; nbits -= 8) {
		*buf++ = (unsigned char) (word & 0xffULL);
		word >>= 8;
	}

	if ((tebc > 5) || (tebc == 0))
		return 5;
	return 4;
}
221 
/*
 * Sets (bfinal nonzero) or clears (bfinal zero) the Deflate BFINAL
 * bit, i.e. the least significant bit of the first byte at buf.
 * This call assumes the block beginning is byte aligned.
 */
static void set_bfinal(void *buf, int bfinal)
{
	unsigned char *first = buf;

	*first = bfinal ? (*first | 0x01) : (*first & 0xfe);
}
235 
compress_file(int argc,char ** argv,void * handle)236 int compress_file(int argc, char **argv, void *handle)
237 {
238 	char *inbuf, *outbuf, *srcbuf, *dstbuf;
239 	char outname[FNAME_MAX];
240 	uint32_t srclen, dstlen;
241 	uint32_t flushlen, chunk;
242 	size_t inlen, outlen, dsttotlen, srctotlen;
243 	uint32_t crc, spbc, tpbc, tebc;
244 	int lzcounts = 0;
245 	int cc;
246 	int num_hdr_bytes;
247 	struct nx_gzip_crb_cpb_t *cmdp;
248 	uint32_t pagelen = 65536;
249 	int fault_tries = NX_MAX_FAULTS;
250 	char buf[32];
251 
252 	cmdp = (void *)(uintptr_t)
253 		aligned_alloc(sizeof(struct nx_gzip_crb_cpb_t),
254 			      sizeof(struct nx_gzip_crb_cpb_t));
255 
256 	if (argc != 2) {
257 		fprintf(stderr, "usage: %s <fname>\n", argv[0]);
258 		exit(-1);
259 	}
260 	if (read_alloc_input_file(argv[1], &inbuf, &inlen))
261 		exit(-1);
262 	fprintf(stderr, "file %s read, %ld bytes\n", argv[1], inlen);
263 
264 	/* Generous output buffer for header/trailer */
265 	outlen = 2 * inlen + 1024;
266 
267 	assert(NULL != (outbuf = (char *)malloc(outlen)));
268 	nxu_touch_pages(outbuf, outlen, pagelen, 1);
269 
270 	/*
271 	 * On PowerVM, the hypervisor defines the maximum request buffer
272 	 * size is defined and this value is available via sysfs.
273 	 */
274 	if (!read_sysfs_file(SYSFS_MAX_REQ_BUF_PATH, buf, sizeof(buf))) {
275 		chunk = atoi(buf);
276 	} else {
277 		/* sysfs entry is not available on PowerNV */
278 		/* Compress piecemeal in smallish chunks */
279 		chunk = 1<<22;
280 	}
281 
282 	/* Write the gzip header to the stream */
283 	num_hdr_bytes = gzip_header_blank(outbuf);
284 	dstbuf    = outbuf + num_hdr_bytes;
285 	outlen    = outlen - num_hdr_bytes;
286 	dsttotlen = num_hdr_bytes;
287 
288 	srcbuf    = inbuf;
289 	srctotlen = 0;
290 
291 	/* Init the CRB, the coprocessor request block */
292 	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
293 
294 	/* Initial gzip crc32 */
295 	put32(cmdp->cpb, in_crc, 0);
296 
297 	while (inlen > 0) {
298 
299 		/* Submit chunk size source data per job */
300 		srclen = NX_MIN(chunk, inlen);
301 		/* Supply large target in case data expands */
302 		dstlen = NX_MIN(2*srclen, outlen);
303 
304 		/* Page faults are handled by the user code */
305 
306 		/* Fault-in pages; an improved code wouldn't touch so
307 		 * many pages but would try to estimate the
308 		 * compression ratio and adjust both the src and dst
309 		 * touch amounts.
310 		 */
311 		nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), pagelen,
312 				1);
313 		nxu_touch_pages(srcbuf, srclen, pagelen, 0);
314 		nxu_touch_pages(dstbuf, dstlen, pagelen, 1);
315 
316 		cc = compress_fht_sample(
317 			srcbuf, srclen,
318 			dstbuf, dstlen,
319 			lzcounts, cmdp, handle);
320 
321 		if (cc != ERR_NX_OK && cc != ERR_NX_TPBC_GT_SPBC &&
322 		    cc != ERR_NX_AT_FAULT) {
323 			fprintf(stderr, "nx error: cc= %d\n", cc);
324 			exit(-1);
325 		}
326 
327 		/* Page faults are handled by the user code */
328 		if (cc == ERR_NX_AT_FAULT) {
329 			NXPRT(fprintf(stderr, "page fault: cc= %d, ", cc));
330 			NXPRT(fprintf(stderr, "try= %d, fsa= %08llx\n",
331 				  fault_tries,
332 				  (unsigned long long) cmdp->crb.csb.fsaddr));
333 			fault_tries--;
334 			if (fault_tries > 0) {
335 				continue;
336 			} else {
337 				fprintf(stderr, "error: cannot progress; ");
338 				fprintf(stderr, "too many faults\n");
339 				exit(-1);
340 			}
341 		}
342 
343 		fault_tries = NX_MAX_FAULTS; /* Reset for the next chunk */
344 
345 		inlen     = inlen - srclen;
346 		srcbuf    = srcbuf + srclen;
347 		srctotlen = srctotlen + srclen;
348 
349 		/* Two possible locations for spbc depending on the function
350 		 * code.
351 		 */
352 		spbc = (!lzcounts) ? get32(cmdp->cpb, out_spbc_comp) :
353 			get32(cmdp->cpb, out_spbc_comp_with_count);
354 		assert(spbc == srclen);
355 
356 		/* Target byte count */
357 		tpbc = get32(cmdp->crb.csb, tpbc);
358 		/* Target ending bit count */
359 		tebc = getnn(cmdp->cpb, out_tebc);
360 		NXPRT(fprintf(stderr, "compressed chunk %d ", spbc));
361 		NXPRT(fprintf(stderr, "to %d bytes, tebc= %d\n", tpbc, tebc));
362 
363 		if (inlen > 0) { /* More chunks to go */
364 			set_bfinal(dstbuf, 0);
365 			dstbuf    = dstbuf + tpbc;
366 			dsttotlen = dsttotlen + tpbc;
367 			outlen    = outlen - tpbc;
368 			/* Round up to the next byte with a flush
369 			 * block; do not set the BFINAqL bit.
370 			 */
371 			flushlen  = append_sync_flush(dstbuf, tebc, 0);
372 			dsttotlen = dsttotlen + flushlen;
373 			outlen    = outlen - flushlen;
374 			dstbuf    = dstbuf + flushlen;
375 			NXPRT(fprintf(stderr, "added sync_flush %d bytes\n",
376 					flushlen));
377 		} else {  /* Done */
378 			/* Set the BFINAL bit of the last block per Deflate
379 			 * specification.
380 			 */
381 			set_bfinal(dstbuf, 1);
382 			dstbuf    = dstbuf + tpbc;
383 			dsttotlen = dsttotlen + tpbc;
384 			outlen    = outlen - tpbc;
385 		}
386 
387 		/* Resuming crc32 for the next chunk */
388 		crc = get32(cmdp->cpb, out_crc);
389 		put32(cmdp->cpb, in_crc, crc);
390 		crc = be32toh(crc);
391 	}
392 
393 	/* Append crc32 and ISIZE to the end */
394 	memcpy(dstbuf, &crc, 4);
395 	memcpy(dstbuf+4, &srctotlen, 4);
396 	dsttotlen = dsttotlen + 8;
397 	outlen    = outlen - 8;
398 
399 	assert(FNAME_MAX > (strlen(argv[1]) + strlen(FEXT)));
400 	strcpy(outname, argv[1]);
401 	strcat(outname, FEXT);
402 	if (write_output_file(outname, outbuf, dsttotlen)) {
403 		fprintf(stderr, "write error: %s\n", outname);
404 		exit(-1);
405 	}
406 
407 	fprintf(stderr, "compressed %ld to %ld bytes total, ", srctotlen,
408 		dsttotlen);
409 	fprintf(stderr, "crc32 checksum = %08x\n", crc);
410 
411 	if (inbuf != NULL)
412 		free(inbuf);
413 
414 	if (outbuf != NULL)
415 		free(outbuf);
416 
417 	return 0;
418 }
419 
main(int argc,char ** argv)420 int main(int argc, char **argv)
421 {
422 	int rc;
423 	struct sigaction act;
424 	void *handle;
425 
426 	nx_dbg = 0;
427 	nx_gzip_log = NULL;
428 	act.sa_handler = 0;
429 	act.sa_sigaction = nxu_sigsegv_handler;
430 	act.sa_flags = SA_SIGINFO;
431 	act.sa_restorer = 0;
432 	sigemptyset(&act.sa_mask);
433 	sigaction(SIGSEGV, &act, NULL);
434 
435 	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
436 	if (!handle) {
437 		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
438 		exit(-1);
439 	}
440 
441 	rc = compress_file(argc, argv, handle);
442 
443 	nx_function_end(handle);
444 
445 	return rc;
446 }
447