/*
 * Optimized version of the ip_fast_csum() function.
 * Used for calculating the IP header checksum.
 *
 * Return: 16-bit checksum, complemented (ones'-complement, ready to store
 *         in the IP header's checksum field)
 *
 * Inputs:
 *	in0: address of buffer to checksum (char *)
 *	in1: length of the header in 32-bit words (5 == standard 20-byte
 *	     header; see the cmp against 5 and the <<2 before do_csum below)
 *
 * Copyright (C) 2002 Intel Corp.
 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
 */

#include <asm/asmmacro.h>

/*
 * Since we know that most likely this function is called with buf aligned
 * on 4-byte boundary and 20 bytes in length, we can execute rather quickly
 * versus calling the generic version of do_csum, which has lots of overhead
 * in handling various alignments and sizes.  However, due to lack of
 * constraints put on the function input argument, cases with alignment not
 * on 4-byte or size not equal to 20 bytes will be handled by the generic
 * do_csum function.
 */

#define in0	r32		// arg 0: buffer address
#define in1	r33		// arg 1: header length in 32-bit words
#define ret0	r8		// return-value register

GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	// Fast path applies only when length == 5 words (20 bytes) AND
	// the buffer is 4-byte aligned; otherwise branch to .generic.
	cmp.ne p6,p7=5,in1	// size other than 20 bytes?
	and r14=3,in0		// low 2 address bits: 4-byte aligned iff 0
	add r15=4,in0		// second source pointer (interleaved stream)
	;;
	// Fold the alignment test into p6/p7: p6 |= (r14 != 0), p7 &= aligned.
	cmp.ne.or.andcm p6,p7=r14,r0
	;;
	// Predicated loads: issued only on the fast path (p7), so the
	// slow-path branch below never sees a misaligned/overlong access.
	(p7) ld4 r20=[in0],8	// words 0 and 1, via two pointers
	(p7) ld4 r21=[r15],8
	(p6) br.spnt .generic	// slow path: defer to do_csum
	;;
	ld4 r22=[in0],8		// words 2 and 3
	ld4 r23=[r15],8
	;;
	ld4 r24=[in0]		// word 4 (last of the 20-byte header)
	// Sum the five 32-bit words in a 64-bit register; no carries can be
	// lost since 5 * 0xffffffff fits easily in 64 bits.
	add r20=r20,r21
	add r22=r22,r23
	;;
	add r20=r20,r22
	;;
	add r20=r20,r24
	;;
	// Fold the 64-bit sum down to 16 bits, repeatedly adding the
	// high part back into the low 16 bits (end-around carry).
	shr.u ret0=r20,16	// now need to add the carry
	zxt2 r20=r20
	;;
	add r20=ret0,r20
	;;
	shr.u ret0=r20,16	// add carry again
	zxt2 r20=r20
	;;
	add r20=ret0,r20
	;;
	shr.u ret0=r20,16	// third fold: sum now fits in 16 bits
	zxt2 r20=r20
	;;
	add r20=ret0,r20
	;;
	andcm ret0=-1,r20	// ones'-complement the result
	.restore sp		// reset frame state
	br.ret.sptk.many b0
	;;

.generic:
	// Slow path: set up a real register frame and call the generic
	// do_csum(buf, len_in_bytes), then complement its result.
	.prologue
	.save ar.pfs, r35
	alloc r35=ar.pfs,2,2,2,0
	.save rp, r34
	mov r34=b0		// preserve return address across the call
	.body
	dep.z out1=in1,2,30	// out1 = in1 << 2: convert words to bytes
	mov out0=in0
	;;
	br.call.sptk.many b0=do_csum
	;;
	andcm ret0=-1,ret0	// complement, matching the fast path
	mov ar.pfs=r35
	mov b0=r34
	br.ret.sptk.many b0
END(ip_fast_csum)