1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/highmem.h>
11 
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
14 
15 #include "siw.h"
16 #include "siw_verbs.h"
17 #include "siw_mem.h"
18 
19 /*
20  * siw_rx_umem()
21  *
22  * Receive data of @len into target referenced by @dest_addr.
23  *
24  * @srx:	Receive Context
25  * @umem:	siw representation of target memory
26  * @dest_addr:	user virtual address
27  * @len:	number of bytes to place
28  */
siw_rx_umem(struct siw_rx_stream * srx,struct siw_umem * umem,u64 dest_addr,int len)29 static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
30 		       u64 dest_addr, int len)
31 {
32 	int copied = 0;
33 
34 	while (len) {
35 		struct page *p;
36 		int pg_off, bytes, rv;
37 		void *dest;
38 
39 		p = siw_get_upage(umem, dest_addr);
40 		if (unlikely(!p)) {
41 			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
42 				__func__, qp_id(rx_qp(srx)),
43 				(void *)(uintptr_t)dest_addr,
44 				(void *)(uintptr_t)umem->fp_addr);
45 			/* siw internal error */
46 			srx->skb_copied += copied;
47 			srx->skb_new -= copied;
48 
49 			return -EFAULT;
50 		}
51 		pg_off = dest_addr & ~PAGE_MASK;
52 		bytes = min(len, (int)PAGE_SIZE - pg_off);
53 
54 		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
55 
56 		dest = kmap_atomic(p);
57 		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
58 				   bytes);
59 
60 		if (unlikely(rv)) {
61 			kunmap_atomic(dest);
62 			srx->skb_copied += copied;
63 			srx->skb_new -= copied;
64 
65 			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
66 				qp_id(rx_qp(srx)), __func__, len, p, rv);
67 
68 			return -EFAULT;
69 		}
70 		if (srx->mpa_crc_hd) {
71 			if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
72 				crypto_shash_update(srx->mpa_crc_hd,
73 					(u8 *)(dest + pg_off), bytes);
74 				kunmap_atomic(dest);
75 			} else {
76 				kunmap_atomic(dest);
77 				/*
78 				 * Do CRC on original, not target buffer.
79 				 * Some user land applications may
80 				 * concurrently write the target buffer,
81 				 * which would yield a broken CRC.
82 				 * Walking the skb twice is very ineffcient.
83 				 * Folding the CRC into skb_copy_bits()
84 				 * would be much better, but is currently
85 				 * not supported.
86 				 */
87 				siw_crc_skb(srx, bytes);
88 			}
89 		} else {
90 			kunmap_atomic(dest);
91 		}
92 		srx->skb_offset += bytes;
93 		copied += bytes;
94 		len -= bytes;
95 		dest_addr += bytes;
96 		pg_off = 0;
97 	}
98 	srx->skb_copied += copied;
99 	srx->skb_new -= copied;
100 
101 	return copied;
102 }
103 
siw_rx_kva(struct siw_rx_stream * srx,void * kva,int len)104 static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
105 {
106 	int rv;
107 
108 	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
109 
110 	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
111 	if (unlikely(rv)) {
112 		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
113 			qp_id(rx_qp(srx)), __func__, len, kva, rv);
114 
115 		return rv;
116 	}
117 	if (srx->mpa_crc_hd)
118 		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
119 
120 	srx->skb_offset += len;
121 	srx->skb_copied += len;
122 	srx->skb_new -= len;
123 
124 	return len;
125 }
126 
/*
 * siw_rx_pbl()
 *
 * Place @len bytes from the current receive skb into memory described
 * by the physical buffer list of @mem, starting at address @addr.
 * @pbl_idx is passed through to siw_pbl_get_buffer() for each lookup.
 *
 * Returns the number of bytes placed. This is less than @len if a
 * buffer lookup yields no address or a copy does not complete.
 */
static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 mem_off = addr - mem->va;
	int placed = 0;

	while (len != 0) {
		int chunk;
		dma_addr_t buf;

		buf = siw_pbl_get_buffer(pbl, mem_off, &chunk, pbl_idx);
		if (!buf)
			return placed;

		/* do not copy beyond what the caller asked for */
		chunk = min(chunk, len);

		if (siw_rx_kva(srx, ib_virt_dma_to_ptr(buf), chunk) != chunk)
			return placed;

		placed += chunk;
		mem_off += chunk;
		len -= chunk;
	}
	return placed;
}
153 
154 /*
155  * siw_rresp_check_ntoh()
156  *
157  * Check incoming RRESP fragment header against expected
158  * header values and update expected values for potential next
159  * fragment.
160  *
161  * NOTE: This function must be called only if a RRESP DDP segment
162  *       starts but not for fragmented consecutive pieces of an
163  *       already started DDP segment.
164  */
siw_rresp_check_ntoh(struct siw_rx_stream * srx,struct siw_rx_fpdu * frx)165 static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
166 				struct siw_rx_fpdu *frx)
167 {
168 	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
169 	struct siw_wqe *wqe = &frx->wqe_active;
170 	enum ddp_ecode ecode;
171 
172 	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
173 	u64 sink_to = be64_to_cpu(rresp->sink_to);
174 
175 	if (frx->first_ddp_seg) {
176 		srx->ddp_stag = wqe->sqe.sge[0].lkey;
177 		srx->ddp_to = wqe->sqe.sge[0].laddr;
178 		frx->pbl_idx = 0;
179 	}
180 	/* Below checks extend beyond the semantics of DDP, and
181 	 * into RDMAP:
182 	 * We check if the read response matches exactly the
183 	 * read request which was send to the remote peer to
184 	 * trigger this read response. RFC5040/5041 do not
185 	 * always have a proper error code for the detected
186 	 * error cases. We choose 'base or bounds error' for
187 	 * cases where the inbound STag is valid, but offset
188 	 * or length do not match our response receive state.
189 	 */
190 	if (unlikely(srx->ddp_stag != sink_stag)) {
191 		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
192 			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
193 		ecode = DDP_ECODE_T_INVALID_STAG;
194 		goto error;
195 	}
196 	if (unlikely(srx->ddp_to != sink_to)) {
197 		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
198 			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
199 			(unsigned long long)srx->ddp_to);
200 		ecode = DDP_ECODE_T_BASE_BOUNDS;
201 		goto error;
202 	}
203 	if (unlikely(!frx->more_ddp_segs &&
204 		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
205 		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
206 			qp_id(rx_qp(srx)),
207 			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
208 		ecode = DDP_ECODE_T_BASE_BOUNDS;
209 		goto error;
210 	}
211 	return 0;
212 error:
213 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
214 			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
215 	return -EINVAL;
216 }
217 
218 /*
219  * siw_write_check_ntoh()
220  *
221  * Check incoming WRITE fragment header against expected
222  * header values and update expected values for potential next
223  * fragment
224  *
225  * NOTE: This function must be called only if a WRITE DDP segment
226  *       starts but not for fragmented consecutive pieces of an
227  *       already started DDP segment.
228  */
siw_write_check_ntoh(struct siw_rx_stream * srx,struct siw_rx_fpdu * frx)229 static int siw_write_check_ntoh(struct siw_rx_stream *srx,
230 				struct siw_rx_fpdu *frx)
231 {
232 	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
233 	enum ddp_ecode ecode;
234 
235 	u32 sink_stag = be32_to_cpu(write->sink_stag);
236 	u64 sink_to = be64_to_cpu(write->sink_to);
237 
238 	if (frx->first_ddp_seg) {
239 		srx->ddp_stag = sink_stag;
240 		srx->ddp_to = sink_to;
241 		frx->pbl_idx = 0;
242 	} else {
243 		if (unlikely(srx->ddp_stag != sink_stag)) {
244 			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
245 				qp_id(rx_qp(srx)), sink_stag,
246 				srx->ddp_stag);
247 			ecode = DDP_ECODE_T_INVALID_STAG;
248 			goto error;
249 		}
250 		if (unlikely(srx->ddp_to != sink_to)) {
251 			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
252 				qp_id(rx_qp(srx)),
253 				(unsigned long long)sink_to,
254 				(unsigned long long)srx->ddp_to);
255 			ecode = DDP_ECODE_T_BASE_BOUNDS;
256 			goto error;
257 		}
258 	}
259 	return 0;
260 error:
261 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
262 			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
263 	return -EINVAL;
264 }
265 
266 /*
267  * siw_send_check_ntoh()
268  *
269  * Check incoming SEND fragment header against expected
270  * header values and update expected MSN if no next
271  * fragment expected
272  *
273  * NOTE: This function must be called only if a SEND DDP segment
274  *       starts but not for fragmented consecutive pieces of an
275  *       already started DDP segment.
276  */
siw_send_check_ntoh(struct siw_rx_stream * srx,struct siw_rx_fpdu * frx)277 static int siw_send_check_ntoh(struct siw_rx_stream *srx,
278 			       struct siw_rx_fpdu *frx)
279 {
280 	struct iwarp_send_inv *send = &srx->hdr.send_inv;
281 	struct siw_wqe *wqe = &frx->wqe_active;
282 	enum ddp_ecode ecode;
283 
284 	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
285 	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
286 	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
287 
288 	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
289 		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
290 			qp_id(rx_qp(srx)), ddp_qn);
291 		ecode = DDP_ECODE_UT_INVALID_QN;
292 		goto error;
293 	}
294 	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
295 		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
296 			qp_id(rx_qp(srx)), ddp_msn,
297 			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
298 		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
299 		goto error;
300 	}
301 	if (unlikely(ddp_mo != wqe->processed)) {
302 		pr_warn("siw: [QP %u], send mo: %u != %u\n",
303 			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
304 		ecode = DDP_ECODE_UT_INVALID_MO;
305 		goto error;
306 	}
307 	if (frx->first_ddp_seg) {
308 		/* initialize user memory write position */
309 		frx->sge_idx = 0;
310 		frx->sge_off = 0;
311 		frx->pbl_idx = 0;
312 
313 		/* only valid for SEND_INV and SEND_SE_INV operations */
314 		srx->inval_stag = be32_to_cpu(send->inval_stag);
315 	}
316 	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
317 		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
318 			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
319 		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
320 		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
321 		goto error;
322 	}
323 	return 0;
324 error:
325 	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
326 			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
327 	return -EINVAL;
328 }
329 
siw_rqe_get(struct siw_qp * qp)330 static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
331 {
332 	struct siw_rqe *rqe;
333 	struct siw_srq *srq;
334 	struct siw_wqe *wqe = NULL;
335 	bool srq_event = false;
336 	unsigned long flags;
337 
338 	srq = qp->srq;
339 	if (srq) {
340 		spin_lock_irqsave(&srq->lock, flags);
341 		if (unlikely(!srq->num_rqe))
342 			goto out;
343 
344 		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
345 	} else {
346 		if (unlikely(!qp->recvq))
347 			goto out;
348 
349 		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
350 	}
351 	if (likely(rqe->flags == SIW_WQE_VALID)) {
352 		int num_sge = rqe->num_sge;
353 
354 		if (likely(num_sge <= SIW_MAX_SGE)) {
355 			int i = 0;
356 
357 			wqe = rx_wqe(&qp->rx_untagged);
358 			rx_type(wqe) = SIW_OP_RECEIVE;
359 			wqe->wr_status = SIW_WR_INPROGRESS;
360 			wqe->bytes = 0;
361 			wqe->processed = 0;
362 
363 			wqe->rqe.id = rqe->id;
364 			wqe->rqe.num_sge = num_sge;
365 
366 			while (i < num_sge) {
367 				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
368 				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
369 				wqe->rqe.sge[i].length = rqe->sge[i].length;
370 				wqe->bytes += wqe->rqe.sge[i].length;
371 				wqe->mem[i] = NULL;
372 				i++;
373 			}
374 			/* can be re-used by appl */
375 			smp_store_mb(rqe->flags, 0);
376 		} else {
377 			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
378 			if (srq)
379 				spin_unlock_irqrestore(&srq->lock, flags);
380 			return NULL;
381 		}
382 		if (!srq) {
383 			qp->rq_get++;
384 		} else {
385 			if (srq->armed) {
386 				/* Test SRQ limit */
387 				u32 off = (srq->rq_get + srq->limit) %
388 					  srq->num_rqe;
389 				struct siw_rqe *rqe2 = &srq->recvq[off];
390 
391 				if (!(rqe2->flags & SIW_WQE_VALID)) {
392 					srq->armed = false;
393 					srq_event = true;
394 				}
395 			}
396 			srq->rq_get++;
397 		}
398 	}
399 out:
400 	if (srq) {
401 		spin_unlock_irqrestore(&srq->lock, flags);
402 		if (srq_event)
403 			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
404 	}
405 	return wqe;
406 }
407 
408 /*
409  * siw_proc_send:
410  *
411  * Process one incoming SEND and place data into memory referenced by
412  * receive wqe.
413  *
414  * Function supports partially received sends (suspending/resuming
415  * current receive wqe processing)
416  *
417  * return value:
418  *	0:       reached the end of a DDP segment
419  *	-EAGAIN: to be called again to finish the DDP segment
420  */
siw_proc_send(struct siw_qp * qp)421 int siw_proc_send(struct siw_qp *qp)
422 {
423 	struct siw_rx_stream *srx = &qp->rx_stream;
424 	struct siw_rx_fpdu *frx = &qp->rx_untagged;
425 	struct siw_wqe *wqe;
426 	u32 data_bytes; /* all data bytes available */
427 	u32 rcvd_bytes; /* sum of data bytes rcvd */
428 	int rv = 0;
429 
430 	if (frx->first_ddp_seg) {
431 		wqe = siw_rqe_get(qp);
432 		if (unlikely(!wqe)) {
433 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
434 					   DDP_ETYPE_UNTAGGED_BUF,
435 					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
436 			return -ENOENT;
437 		}
438 	} else {
439 		wqe = rx_wqe(frx);
440 	}
441 	if (srx->state == SIW_GET_DATA_START) {
442 		rv = siw_send_check_ntoh(srx, frx);
443 		if (unlikely(rv)) {
444 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
445 			return rv;
446 		}
447 		if (!srx->fpdu_part_rem) /* zero length SEND */
448 			return 0;
449 	}
450 	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
451 	rcvd_bytes = 0;
452 
453 	/* A zero length SEND will skip below loop */
454 	while (data_bytes) {
455 		struct ib_pd *pd;
456 		struct siw_mem **mem, *mem_p;
457 		struct siw_sge *sge;
458 		u32 sge_bytes; /* data bytes avail for SGE */
459 
460 		sge = &wqe->rqe.sge[frx->sge_idx];
461 
462 		if (!sge->length) {
463 			/* just skip empty sge's */
464 			frx->sge_idx++;
465 			frx->sge_off = 0;
466 			frx->pbl_idx = 0;
467 			continue;
468 		}
469 		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
470 		mem = &wqe->mem[frx->sge_idx];
471 
472 		/*
473 		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
474 		 */
475 		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
476 
477 		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
478 				   frx->sge_off, sge_bytes);
479 		if (unlikely(rv)) {
480 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
481 					   DDP_ETYPE_CATASTROPHIC,
482 					   DDP_ECODE_CATASTROPHIC, 0);
483 
484 			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
485 			break;
486 		}
487 		mem_p = *mem;
488 		if (mem_p->mem_obj == NULL)
489 			rv = siw_rx_kva(srx,
490 				ib_virt_dma_to_ptr(sge->laddr + frx->sge_off),
491 				sge_bytes);
492 		else if (!mem_p->is_pbl)
493 			rv = siw_rx_umem(srx, mem_p->umem,
494 					 sge->laddr + frx->sge_off, sge_bytes);
495 		else
496 			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
497 					sge->laddr + frx->sge_off, sge_bytes);
498 
499 		if (unlikely(rv != sge_bytes)) {
500 			wqe->processed += rcvd_bytes;
501 
502 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
503 					   DDP_ETYPE_CATASTROPHIC,
504 					   DDP_ECODE_CATASTROPHIC, 0);
505 			return -EINVAL;
506 		}
507 		frx->sge_off += rv;
508 
509 		if (frx->sge_off == sge->length) {
510 			frx->sge_idx++;
511 			frx->sge_off = 0;
512 			frx->pbl_idx = 0;
513 		}
514 		data_bytes -= rv;
515 		rcvd_bytes += rv;
516 
517 		srx->fpdu_part_rem -= rv;
518 		srx->fpdu_part_rcvd += rv;
519 	}
520 	wqe->processed += rcvd_bytes;
521 
522 	if (!srx->fpdu_part_rem)
523 		return 0;
524 
525 	return (rv < 0) ? rv : -EAGAIN;
526 }
527 
528 /*
529  * siw_proc_write:
530  *
531  * Place incoming WRITE after referencing and checking target buffer
532 
533  * Function supports partially received WRITEs (suspending/resuming
534  * current receive processing)
535  *
536  * return value:
537  *	0:       reached the end of a DDP segment
538  *	-EAGAIN: to be called again to finish the DDP segment
539  */
siw_proc_write(struct siw_qp * qp)540 int siw_proc_write(struct siw_qp *qp)
541 {
542 	struct siw_rx_stream *srx = &qp->rx_stream;
543 	struct siw_rx_fpdu *frx = &qp->rx_tagged;
544 	struct siw_mem *mem;
545 	int bytes, rv;
546 
547 	if (srx->state == SIW_GET_DATA_START) {
548 		if (!srx->fpdu_part_rem) /* zero length WRITE */
549 			return 0;
550 
551 		rv = siw_write_check_ntoh(srx, frx);
552 		if (unlikely(rv)) {
553 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
554 			return rv;
555 		}
556 	}
557 	bytes = min(srx->fpdu_part_rem, srx->skb_new);
558 
559 	if (frx->first_ddp_seg) {
560 		struct siw_wqe *wqe = rx_wqe(frx);
561 
562 		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
563 		if (unlikely(!rx_mem(frx))) {
564 			siw_dbg_qp(qp,
565 				   "sink stag not found/invalid, stag 0x%08x\n",
566 				   srx->ddp_stag);
567 
568 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
569 					   DDP_ETYPE_TAGGED_BUF,
570 					   DDP_ECODE_T_INVALID_STAG, 0);
571 			return -EINVAL;
572 		}
573 		wqe->rqe.num_sge = 1;
574 		rx_type(wqe) = SIW_OP_WRITE;
575 		wqe->wr_status = SIW_WR_INPROGRESS;
576 	}
577 	mem = rx_mem(frx);
578 
579 	/*
580 	 * Check if application re-registered memory with different
581 	 * key field of STag.
582 	 */
583 	if (unlikely(mem->stag != srx->ddp_stag)) {
584 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
585 				   DDP_ETYPE_TAGGED_BUF,
586 				   DDP_ECODE_T_INVALID_STAG, 0);
587 		return -EINVAL;
588 	}
589 	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
590 			   IB_ACCESS_REMOTE_WRITE, bytes);
591 	if (unlikely(rv)) {
592 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
593 				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
594 				   0);
595 
596 		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
597 
598 		return -EINVAL;
599 	}
600 
601 	if (mem->mem_obj == NULL)
602 		rv = siw_rx_kva(srx,
603 			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
604 			bytes);
605 	else if (!mem->is_pbl)
606 		rv = siw_rx_umem(srx, mem->umem,
607 				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
608 	else
609 		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
610 				srx->ddp_to + srx->fpdu_part_rcvd, bytes);
611 
612 	if (unlikely(rv != bytes)) {
613 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
614 				   DDP_ETYPE_CATASTROPHIC,
615 				   DDP_ECODE_CATASTROPHIC, 0);
616 		return -EINVAL;
617 	}
618 	srx->fpdu_part_rem -= rv;
619 	srx->fpdu_part_rcvd += rv;
620 
621 	if (!srx->fpdu_part_rem) {
622 		srx->ddp_to += srx->fpdu_part_rcvd;
623 		return 0;
624 	}
625 	return -EAGAIN;
626 }
627 
628 /*
629  * Inbound RREQ's cannot carry user data.
630  */
siw_proc_rreq(struct siw_qp * qp)631 int siw_proc_rreq(struct siw_qp *qp)
632 {
633 	struct siw_rx_stream *srx = &qp->rx_stream;
634 
635 	if (!srx->fpdu_part_rem)
636 		return 0;
637 
638 	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
639 		be16_to_cpu(srx->hdr.ctrl.mpa_len));
640 
641 	return -EPROTO;
642 }
643 
644 /*
645  * siw_init_rresp:
646  *
647  * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
648  * Put it at the tail of the IRQ, if there is another WQE currently in
649  * transmit processing. If not, make it the current WQE to be processed
650  * and schedule transmit processing.
651  *
652  * Can be called from softirq context and from process
653  * context (RREAD socket loopback case!)
654  *
655  * return value:
656  *	0:      success,
657  *		failure code otherwise
658  */
659 
siw_init_rresp(struct siw_qp * qp,struct siw_rx_stream * srx)660 static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
661 {
662 	struct siw_wqe *tx_work = tx_wqe(qp);
663 	struct siw_sqe *resp;
664 
665 	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
666 		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
667 	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
668 		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
669 		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
670 		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
671 
672 	int run_sq = 1, rv = 0;
673 	unsigned long flags;
674 
675 	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
676 		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
677 				   DDP_ETYPE_UNTAGGED_BUF,
678 				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
679 		return -EPROTO;
680 	}
681 	spin_lock_irqsave(&qp->sq_lock, flags);
682 
683 	if (unlikely(!qp->attrs.irq_size)) {
684 		run_sq = 0;
685 		goto error_irq;
686 	}
687 	if (tx_work->wr_status == SIW_WR_IDLE) {
688 		/*
689 		 * immediately schedule READ response w/o
690 		 * consuming IRQ entry: IRQ must be empty.
691 		 */
692 		tx_work->processed = 0;
693 		tx_work->mem[0] = NULL;
694 		tx_work->wr_status = SIW_WR_QUEUED;
695 		resp = &tx_work->sqe;
696 	} else {
697 		resp = irq_alloc_free(qp);
698 		run_sq = 0;
699 	}
700 	if (likely(resp)) {
701 		resp->opcode = SIW_OP_READ_RESPONSE;
702 
703 		resp->sge[0].length = length;
704 		resp->sge[0].laddr = laddr;
705 		resp->sge[0].lkey = lkey;
706 
707 		/* Keep aside message sequence number for potential
708 		 * error reporting during Read Response generation.
709 		 */
710 		resp->sge[1].length = msn;
711 
712 		resp->raddr = raddr;
713 		resp->rkey = rkey;
714 		resp->num_sge = length ? 1 : 0;
715 
716 		/* RRESP now valid as current TX wqe or placed into IRQ */
717 		smp_store_mb(resp->flags, SIW_WQE_VALID);
718 	} else {
719 error_irq:
720 		pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
721 			qp_id(qp), qp->attrs.irq_size);
722 
723 		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
724 				   RDMAP_ETYPE_REMOTE_OPERATION,
725 				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
726 		rv = -EPROTO;
727 	}
728 
729 	spin_unlock_irqrestore(&qp->sq_lock, flags);
730 
731 	if (run_sq)
732 		rv = siw_sq_start(qp);
733 
734 	return rv;
735 }
736 
737 /*
738  * Only called at start of Read.Resonse processing.
739  * Transfer pending Read from tip of ORQ into currrent rx wqe,
740  * but keep ORQ entry valid until Read.Response processing done.
741  * No Queue locking needed.
742  */
siw_orqe_start_rx(struct siw_qp * qp)743 static int siw_orqe_start_rx(struct siw_qp *qp)
744 {
745 	struct siw_sqe *orqe;
746 	struct siw_wqe *wqe = NULL;
747 
748 	if (unlikely(!qp->attrs.orq_size))
749 		return -EPROTO;
750 
751 	/* make sure ORQ indices are current */
752 	smp_mb();
753 
754 	orqe = orq_get_current(qp);
755 	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
756 		/* RRESP is a TAGGED RDMAP operation */
757 		wqe = rx_wqe(&qp->rx_tagged);
758 		wqe->sqe.id = orqe->id;
759 		wqe->sqe.opcode = orqe->opcode;
760 		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
761 		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
762 		wqe->sqe.sge[0].length = orqe->sge[0].length;
763 		wqe->sqe.flags = orqe->flags;
764 		wqe->sqe.num_sge = 1;
765 		wqe->bytes = orqe->sge[0].length;
766 		wqe->processed = 0;
767 		wqe->mem[0] = NULL;
768 		/* make sure WQE is completely written before valid */
769 		smp_wmb();
770 		wqe->wr_status = SIW_WR_INPROGRESS;
771 
772 		return 0;
773 	}
774 	return -EPROTO;
775 }
776 
777 /*
778  * siw_proc_rresp:
779  *
780  * Place incoming RRESP data into memory referenced by RREQ WQE
781  * which is at the tip of the ORQ
782  *
783  * Function supports partially received RRESP's (suspending/resuming
784  * current receive processing)
785  */
siw_proc_rresp(struct siw_qp * qp)786 int siw_proc_rresp(struct siw_qp *qp)
787 {
788 	struct siw_rx_stream *srx = &qp->rx_stream;
789 	struct siw_rx_fpdu *frx = &qp->rx_tagged;
790 	struct siw_wqe *wqe = rx_wqe(frx);
791 	struct siw_mem **mem, *mem_p;
792 	struct siw_sge *sge;
793 	int bytes, rv;
794 
795 	if (frx->first_ddp_seg) {
796 		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
797 			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
798 				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
799 			rv = -EPROTO;
800 			goto error_term;
801 		}
802 		/*
803 		 * fetch pending RREQ from orq
804 		 */
805 		rv = siw_orqe_start_rx(qp);
806 		if (rv) {
807 			pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
808 				qp_id(qp), qp->attrs.orq_size);
809 			goto error_term;
810 		}
811 		rv = siw_rresp_check_ntoh(srx, frx);
812 		if (unlikely(rv)) {
813 			siw_qp_event(qp, IB_EVENT_QP_FATAL);
814 			return rv;
815 		}
816 	} else {
817 		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
818 			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
819 				qp_id(qp), wqe->wr_status);
820 			rv = -EPROTO;
821 			goto error_term;
822 		}
823 	}
824 	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
825 		return 0;
826 
827 	sge = wqe->sqe.sge; /* there is only one */
828 	mem = &wqe->mem[0];
829 
830 	if (!(*mem)) {
831 		/*
832 		 * check target memory which resolves memory on first fragment
833 		 */
834 		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
835 				   wqe->bytes);
836 		if (unlikely(rv)) {
837 			siw_dbg_qp(qp, "target mem check: %d\n", rv);
838 			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
839 
840 			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
841 					   DDP_ETYPE_TAGGED_BUF,
842 					   siw_tagged_error(-rv), 0);
843 
844 			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
845 
846 			return -EINVAL;
847 		}
848 	}
849 	mem_p = *mem;
850 
851 	bytes = min(srx->fpdu_part_rem, srx->skb_new);
852 
853 	if (mem_p->mem_obj == NULL)
854 		rv = siw_rx_kva(srx,
855 			ib_virt_dma_to_ptr(sge->laddr + wqe->processed),
856 			bytes);
857 	else if (!mem_p->is_pbl)
858 		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
859 				 bytes);
860 	else
861 		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
862 				sge->laddr + wqe->processed, bytes);
863 	if (rv != bytes) {
864 		wqe->wc_status = SIW_WC_GENERAL_ERR;
865 		rv = -EINVAL;
866 		goto error_term;
867 	}
868 	srx->fpdu_part_rem -= rv;
869 	srx->fpdu_part_rcvd += rv;
870 	wqe->processed += rv;
871 
872 	if (!srx->fpdu_part_rem) {
873 		srx->ddp_to += srx->fpdu_part_rcvd;
874 		return 0;
875 	}
876 	return -EAGAIN;
877 
878 error_term:
879 	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
880 			   DDP_ECODE_CATASTROPHIC, 0);
881 	return rv;
882 }
883 
siw_proc_terminate(struct siw_qp * qp)884 int siw_proc_terminate(struct siw_qp *qp)
885 {
886 	struct siw_rx_stream *srx = &qp->rx_stream;
887 	struct sk_buff *skb = srx->skb;
888 	struct iwarp_terminate *term = &srx->hdr.terminate;
889 	union iwarp_hdr term_info;
890 	u8 *infop = (u8 *)&term_info;
891 	enum rdma_opcode op;
892 	u16 to_copy = sizeof(struct iwarp_ctrl);
893 
894 	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
895 		__rdmap_term_layer(term), __rdmap_term_etype(term),
896 		__rdmap_term_ecode(term));
897 
898 	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
899 	    be32_to_cpu(term->ddp_msn) !=
900 		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
901 	    be32_to_cpu(term->ddp_mo) != 0) {
902 		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
903 			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
904 			be32_to_cpu(term->ddp_mo));
905 		return -ECONNRESET;
906 	}
907 	/*
908 	 * Receive remaining pieces of TERM if indicated
909 	 */
910 	if (!term->flag_m)
911 		return -ECONNRESET;
912 
913 	/* Do not take the effort to reassemble a network fragmented
914 	 * TERM message
915 	 */
916 	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
917 		return -ECONNRESET;
918 
919 	memset(infop, 0, sizeof(term_info));
920 
921 	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
922 
923 	op = __rdmap_get_opcode(&term_info.ctrl);
924 	if (op >= RDMAP_TERMINATE)
925 		goto out;
926 
927 	infop += to_copy;
928 	srx->skb_offset += to_copy;
929 	srx->skb_new -= to_copy;
930 	srx->skb_copied += to_copy;
931 	srx->fpdu_part_rcvd += to_copy;
932 	srx->fpdu_part_rem -= to_copy;
933 
934 	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
935 
936 	/* Again, no network fragmented TERM's */
937 	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
938 		return -ECONNRESET;
939 
940 	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
941 
942 	if (term->flag_r) {
943 		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
944 			   op, be16_to_cpu(term_info.ctrl.mpa_len),
945 			   term->flag_m ? "valid" : "invalid");
946 	} else if (term->flag_d) {
947 		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
948 			   op, be16_to_cpu(term_info.ctrl.mpa_len),
949 			   term->flag_m ? "valid" : "invalid");
950 	}
951 out:
952 	srx->skb_new -= to_copy;
953 	srx->skb_offset += to_copy;
954 	srx->skb_copied += to_copy;
955 	srx->fpdu_part_rcvd += to_copy;
956 	srx->fpdu_part_rem -= to_copy;
957 
958 	return -ECONNRESET;
959 }
960 
siw_get_trailer(struct siw_qp * qp,struct siw_rx_stream * srx)961 static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
962 {
963 	struct sk_buff *skb = srx->skb;
964 	int avail = min(srx->skb_new, srx->fpdu_part_rem);
965 	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
966 	__wsum crc_in, crc_own = 0;
967 
968 	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
969 		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
970 
971 	skb_copy_bits(skb, srx->skb_offset, tbuf, avail);
972 
973 	srx->skb_new -= avail;
974 	srx->skb_offset += avail;
975 	srx->skb_copied += avail;
976 	srx->fpdu_part_rem -= avail;
977 
978 	if (srx->fpdu_part_rem)
979 		return -EAGAIN;
980 
981 	if (!srx->mpa_crc_hd)
982 		return 0;
983 
984 	if (srx->pad)
985 		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
986 	/*
987 	 * CRC32 is computed, transmitted and received directly in NBO,
988 	 * so there's never a reason to convert byte order.
989 	 */
990 	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
991 	crc_in = (__force __wsum)srx->trailer.crc;
992 
993 	if (unlikely(crc_in != crc_own)) {
994 		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
995 			crc_in, crc_own, qp->rx_stream.rdmap_op);
996 
997 		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
998 				   LLP_ETYPE_MPA,
999 				   LLP_ECODE_RECEIVED_CRC, 0);
1000 		return -EINVAL;
1001 	}
1002 	return 0;
1003 }
1004 
1005 #define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
1006 
siw_get_hdr(struct siw_rx_stream * srx)1007 static int siw_get_hdr(struct siw_rx_stream *srx)
1008 {
1009 	struct sk_buff *skb = srx->skb;
1010 	struct siw_qp *qp = rx_qp(srx);
1011 	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1012 	struct siw_rx_fpdu *frx;
1013 	u8 opcode;
1014 	int bytes;
1015 
1016 	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1017 		/*
1018 		 * copy a mimimum sized (tagged) DDP frame control part
1019 		 */
1020 		bytes = min_t(int, srx->skb_new,
1021 			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
1022 
1023 		skb_copy_bits(skb, srx->skb_offset,
1024 			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1025 
1026 		srx->fpdu_part_rcvd += bytes;
1027 
1028 		srx->skb_new -= bytes;
1029 		srx->skb_offset += bytes;
1030 		srx->skb_copied += bytes;
1031 
1032 		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1033 			return -EAGAIN;
1034 
1035 		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1036 			enum ddp_etype etype;
1037 			enum ddp_ecode ecode;
1038 
1039 			pr_warn("siw: received ddp version unsupported %d\n",
1040 				__ddp_get_version(c_hdr));
1041 
1042 			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1043 				etype = DDP_ETYPE_TAGGED_BUF;
1044 				ecode = DDP_ECODE_T_VERSION;
1045 			} else {
1046 				etype = DDP_ETYPE_UNTAGGED_BUF;
1047 				ecode = DDP_ECODE_UT_VERSION;
1048 			}
1049 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1050 					   etype, ecode, 0);
1051 			return -EINVAL;
1052 		}
1053 		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1054 			pr_warn("siw: received rdmap version unsupported %d\n",
1055 				__rdmap_get_version(c_hdr));
1056 
1057 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1058 					   RDMAP_ETYPE_REMOTE_OPERATION,
1059 					   RDMAP_ECODE_VERSION, 0);
1060 			return -EINVAL;
1061 		}
1062 		opcode = __rdmap_get_opcode(c_hdr);
1063 
1064 		if (opcode > RDMAP_TERMINATE) {
1065 			pr_warn("siw: received unknown packet type %u\n",
1066 				opcode);
1067 
1068 			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1069 					   RDMAP_ETYPE_REMOTE_OPERATION,
1070 					   RDMAP_ECODE_OPCODE, 0);
1071 			return -EINVAL;
1072 		}
1073 		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1074 	} else {
1075 		opcode = __rdmap_get_opcode(c_hdr);
1076 	}
1077 	set_rx_fpdu_context(qp, opcode);
1078 	frx = qp->rx_fpdu;
1079 
1080 	/*
1081 	 * Figure out len of current hdr: variable length of
1082 	 * iwarp hdr may force us to copy hdr information in
1083 	 * two steps. Only tagged DDP messages are already
1084 	 * completely received.
1085 	 */
1086 	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1087 		int hdrlen = iwarp_pktinfo[opcode].hdr_len;
1088 
1089 		bytes = min_t(int, hdrlen - MIN_DDP_HDR, srx->skb_new);
1090 
1091 		skb_copy_bits(skb, srx->skb_offset,
1092 			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1093 
1094 		srx->fpdu_part_rcvd += bytes;
1095 
1096 		srx->skb_new -= bytes;
1097 		srx->skb_offset += bytes;
1098 		srx->skb_copied += bytes;
1099 
1100 		if (srx->fpdu_part_rcvd < hdrlen)
1101 			return -EAGAIN;
1102 	}
1103 
1104 	/*
1105 	 * DDP/RDMAP header receive completed. Check if the current
1106 	 * DDP segment starts a new RDMAP message or continues a previously
1107 	 * started RDMAP message.
1108 	 *
1109 	 * Alternating reception of DDP segments (or FPDUs) from incomplete
1110 	 * tagged and untagged RDMAP messages is supported, as long as
1111 	 * the current tagged or untagged message gets eventually completed
1112 	 * w/o intersection from another message of the same type
1113 	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1114 	 * but not by a READ RESPONSE etc.
1115 	 */
1116 	if (srx->mpa_crc_hd) {
1117 		/*
1118 		 * Restart CRC computation
1119 		 */
1120 		crypto_shash_init(srx->mpa_crc_hd);
1121 		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1122 				    srx->fpdu_part_rcvd);
1123 	}
1124 	if (frx->more_ddp_segs) {
1125 		frx->first_ddp_seg = 0;
1126 		if (frx->prev_rdmap_op != opcode) {
1127 			pr_warn("siw: packet intersection: %u : %u\n",
1128 				frx->prev_rdmap_op, opcode);
1129 			/*
1130 			 * The last inbound RDMA operation of same type
1131 			 * (tagged or untagged) is left unfinished.
1132 			 * To complete it in error, make it the current
1133 			 * operation again, even with the header already
1134 			 * overwritten. For error handling, only the opcode
1135 			 * and current rx context are relevant.
1136 			 */
1137 			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1138 			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1139 			return -EPROTO;
1140 		}
1141 	} else {
1142 		frx->prev_rdmap_op = opcode;
1143 		frx->first_ddp_seg = 1;
1144 	}
1145 	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1146 
1147 	return 0;
1148 }
1149 
siw_check_tx_fence(struct siw_qp * qp)1150 static int siw_check_tx_fence(struct siw_qp *qp)
1151 {
1152 	struct siw_wqe *tx_waiting = tx_wqe(qp);
1153 	struct siw_sqe *rreq;
1154 	int resume_tx = 0, rv = 0;
1155 	unsigned long flags;
1156 
1157 	spin_lock_irqsave(&qp->orq_lock, flags);
1158 
1159 	/* free current orq entry */
1160 	rreq = orq_get_current(qp);
1161 	WRITE_ONCE(rreq->flags, 0);
1162 
1163 	qp->orq_get++;
1164 
1165 	if (qp->tx_ctx.orq_fence) {
1166 		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1167 			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1168 				qp_id(qp), tx_waiting->wr_status);
1169 			rv = -EPROTO;
1170 			goto out;
1171 		}
1172 		/* resume SQ processing, if possible */
1173 		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1174 		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1175 
1176 			/* SQ processing was stopped because of a full ORQ */
1177 			rreq = orq_get_free(qp);
1178 			if (unlikely(!rreq)) {
1179 				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1180 				rv = -EPROTO;
1181 				goto out;
1182 			}
1183 			siw_read_to_orq(rreq, &tx_waiting->sqe);
1184 
1185 			qp->orq_put++;
1186 			qp->tx_ctx.orq_fence = 0;
1187 			resume_tx = 1;
1188 
1189 		} else if (siw_orq_empty(qp)) {
1190 			/*
1191 			 * SQ processing was stopped by fenced work request.
1192 			 * Resume since all previous Read's are now completed.
1193 			 */
1194 			qp->tx_ctx.orq_fence = 0;
1195 			resume_tx = 1;
1196 		}
1197 	}
1198 out:
1199 	spin_unlock_irqrestore(&qp->orq_lock, flags);
1200 
1201 	if (resume_tx)
1202 		rv = siw_sq_start(qp);
1203 
1204 	return rv;
1205 }
1206 
1207 /*
1208  * siw_rdmap_complete()
1209  *
1210  * Complete processing of an RDMA message after receiving all
1211  * DDP segmens or ABort processing after encountering error case.
1212  *
1213  *   o SENDs + RRESPs will need for completion,
1214  *   o RREQs need for  READ RESPONSE initialization
1215  *   o WRITEs need memory dereferencing
1216  *
1217  * TODO: Failed WRITEs need local error to be surfaced.
1218  */
siw_rdmap_complete(struct siw_qp * qp,int error)1219 static int siw_rdmap_complete(struct siw_qp *qp, int error)
1220 {
1221 	struct siw_rx_stream *srx = &qp->rx_stream;
1222 	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1223 	enum siw_wc_status wc_status = wqe->wc_status;
1224 	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1225 	int rv = 0;
1226 
1227 	switch (opcode) {
1228 	case RDMAP_SEND_SE:
1229 	case RDMAP_SEND_SE_INVAL:
1230 		wqe->rqe.flags |= SIW_WQE_SOLICITED;
1231 		fallthrough;
1232 
1233 	case RDMAP_SEND:
1234 	case RDMAP_SEND_INVAL:
1235 		if (wqe->wr_status == SIW_WR_IDLE)
1236 			break;
1237 
1238 		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1239 
1240 		if (error != 0 && wc_status == SIW_WC_SUCCESS)
1241 			wc_status = SIW_WC_GENERAL_ERR;
1242 		/*
1243 		 * Handle STag invalidation request
1244 		 */
1245 		if (wc_status == SIW_WC_SUCCESS &&
1246 		    (opcode == RDMAP_SEND_INVAL ||
1247 		     opcode == RDMAP_SEND_SE_INVAL)) {
1248 			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1249 			if (rv) {
1250 				siw_init_terminate(
1251 					qp, TERM_ERROR_LAYER_RDMAP,
1252 					rv == -EACCES ?
1253 						RDMAP_ETYPE_REMOTE_PROTECTION :
1254 						RDMAP_ETYPE_REMOTE_OPERATION,
1255 					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1256 
1257 				wc_status = SIW_WC_REM_INV_REQ_ERR;
1258 			}
1259 			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1260 					      rv ? 0 : srx->inval_stag,
1261 					      wc_status);
1262 		} else {
1263 			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1264 					      0, wc_status);
1265 		}
1266 		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1267 		break;
1268 
1269 	case RDMAP_RDMA_READ_RESP:
1270 		if (wqe->wr_status == SIW_WR_IDLE)
1271 			break;
1272 
1273 		if (error != 0) {
1274 			if ((srx->state == SIW_GET_HDR &&
1275 			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1276 				/* possible RREQ in ORQ left untouched */
1277 				break;
1278 
1279 			if (wc_status == SIW_WC_SUCCESS)
1280 				wc_status = SIW_WC_GENERAL_ERR;
1281 		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
1282 			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1283 			/*
1284 			 * Handle any STag invalidation request
1285 			 */
1286 			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1287 			if (rv) {
1288 				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1289 						   RDMAP_ETYPE_CATASTROPHIC,
1290 						   RDMAP_ECODE_UNSPECIFIED, 0);
1291 
1292 				if (wc_status == SIW_WC_SUCCESS) {
1293 					wc_status = SIW_WC_GENERAL_ERR;
1294 					error = rv;
1295 				}
1296 			}
1297 		}
1298 		/*
1299 		 * All errors turn the wqe into signalled.
1300 		 */
1301 		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1302 			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1303 					      wc_status);
1304 		siw_wqe_put_mem(wqe, SIW_OP_READ);
1305 
1306 		if (!error) {
1307 			rv = siw_check_tx_fence(qp);
1308 		} else {
1309 			/* Disable current ORQ element */
1310 			if (qp->attrs.orq_size)
1311 				WRITE_ONCE(orq_get_current(qp)->flags, 0);
1312 		}
1313 		break;
1314 
1315 	case RDMAP_RDMA_READ_REQ:
1316 		if (!error) {
1317 			rv = siw_init_rresp(qp, srx);
1318 			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1319 		}
1320 		break;
1321 
1322 	case RDMAP_RDMA_WRITE:
1323 		if (wqe->wr_status == SIW_WR_IDLE)
1324 			break;
1325 
1326 		/*
1327 		 * Free References from memory object if
1328 		 * attached to receive context (inbound WRITE).
1329 		 * While a zero-length WRITE is allowed,
1330 		 * no memory reference got created.
1331 		 */
1332 		if (rx_mem(&qp->rx_tagged)) {
1333 			siw_mem_put(rx_mem(&qp->rx_tagged));
1334 			rx_mem(&qp->rx_tagged) = NULL;
1335 		}
1336 		break;
1337 
1338 	default:
1339 		break;
1340 	}
1341 	wqe->wr_status = SIW_WR_IDLE;
1342 
1343 	return rv;
1344 }
1345 
1346 /*
1347  * siw_tcp_rx_data()
1348  *
1349  * Main routine to consume inbound TCP payload
1350  *
1351  * @rd_desc:	read descriptor
1352  * @skb:	socket buffer
1353  * @off:	offset in skb
1354  * @len:	skb->len - offset : payload in skb
1355  */
siw_tcp_rx_data(read_descriptor_t * rd_desc,struct sk_buff * skb,unsigned int off,size_t len)1356 int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1357 		    unsigned int off, size_t len)
1358 {
1359 	struct siw_qp *qp = rd_desc->arg.data;
1360 	struct siw_rx_stream *srx = &qp->rx_stream;
1361 	int rv;
1362 
1363 	srx->skb = skb;
1364 	srx->skb_new = skb->len - off;
1365 	srx->skb_offset = off;
1366 	srx->skb_copied = 0;
1367 
1368 	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1369 
1370 	while (srx->skb_new) {
1371 		int run_completion = 1;
1372 
1373 		if (unlikely(srx->rx_suspend)) {
1374 			/* Do not process any more data */
1375 			srx->skb_copied += srx->skb_new;
1376 			break;
1377 		}
1378 		switch (srx->state) {
1379 		case SIW_GET_HDR:
1380 			rv = siw_get_hdr(srx);
1381 			if (!rv) {
1382 				srx->fpdu_part_rem =
1383 					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1384 					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1385 
1386 				if (srx->fpdu_part_rem)
1387 					srx->pad = -srx->fpdu_part_rem & 0x3;
1388 				else
1389 					srx->pad = 0;
1390 
1391 				srx->state = SIW_GET_DATA_START;
1392 				srx->fpdu_part_rcvd = 0;
1393 			}
1394 			break;
1395 
1396 		case SIW_GET_DATA_MORE:
1397 			/*
1398 			 * Another data fragment of the same DDP segment.
1399 			 * Setting first_ddp_seg = 0 avoids repeating
1400 			 * initializations that shall occur only once per
1401 			 * DDP segment.
1402 			 */
1403 			qp->rx_fpdu->first_ddp_seg = 0;
1404 			fallthrough;
1405 
1406 		case SIW_GET_DATA_START:
1407 			/*
1408 			 * Headers will be checked by the opcode-specific
1409 			 * data receive function below.
1410 			 */
1411 			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1412 			if (!rv) {
1413 				int mpa_len =
1414 					be16_to_cpu(srx->hdr.ctrl.mpa_len)
1415 					+ MPA_HDR_SIZE;
1416 
1417 				srx->fpdu_part_rem = (-mpa_len & 0x3)
1418 						      + MPA_CRC_SIZE;
1419 				srx->fpdu_part_rcvd = 0;
1420 				srx->state = SIW_GET_TRAILER;
1421 			} else {
1422 				if (unlikely(rv == -ECONNRESET))
1423 					run_completion = 0;
1424 				else
1425 					srx->state = SIW_GET_DATA_MORE;
1426 			}
1427 			break;
1428 
1429 		case SIW_GET_TRAILER:
1430 			/*
1431 			 * read CRC + any padding
1432 			 */
1433 			rv = siw_get_trailer(qp, srx);
1434 			if (likely(!rv)) {
1435 				/*
1436 				 * FPDU completed.
1437 				 * complete RDMAP message if last fragment
1438 				 */
1439 				srx->state = SIW_GET_HDR;
1440 				srx->fpdu_part_rcvd = 0;
1441 
1442 				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1443 				      DDP_FLAG_LAST))
1444 					/* more frags */
1445 					break;
1446 
1447 				rv = siw_rdmap_complete(qp, 0);
1448 				run_completion = 0;
1449 			}
1450 			break;
1451 
1452 		default:
1453 			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1454 			rv = -EPROTO;
1455 			run_completion = 0;
1456 		}
1457 		if (unlikely(rv != 0 && rv != -EAGAIN)) {
1458 			if ((srx->state > SIW_GET_HDR ||
1459 			     qp->rx_fpdu->more_ddp_segs) && run_completion)
1460 				siw_rdmap_complete(qp, rv);
1461 
1462 			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1463 				   srx->state);
1464 
1465 			siw_qp_cm_drop(qp, 1);
1466 
1467 			break;
1468 		}
1469 		if (rv) {
1470 			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1471 				   srx->state, srx->fpdu_part_rem);
1472 			break;
1473 		}
1474 	}
1475 	return srx->skb_copied;
1476 }
1477