1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/highmem.h>
11
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
14
15 #include "siw.h"
16 #include "siw_verbs.h"
17 #include "siw_mem.h"
18
19 /*
20 * siw_rx_umem()
21 *
22 * Receive data of @len into target referenced by @dest_addr.
23 *
24 * @srx: Receive Context
25 * @umem: siw representation of target memory
26 * @dest_addr: user virtual address
27 * @len: number of bytes to place
28 */
siw_rx_umem(struct siw_rx_stream * srx,struct siw_umem * umem,u64 dest_addr,int len)29 static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
30 u64 dest_addr, int len)
31 {
32 int copied = 0;
33
34 while (len) {
35 struct page *p;
36 int pg_off, bytes, rv;
37 void *dest;
38
39 p = siw_get_upage(umem, dest_addr);
40 if (unlikely(!p)) {
41 pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
42 __func__, qp_id(rx_qp(srx)),
43 (void *)(uintptr_t)dest_addr,
44 (void *)(uintptr_t)umem->fp_addr);
45 /* siw internal error */
46 srx->skb_copied += copied;
47 srx->skb_new -= copied;
48
49 return -EFAULT;
50 }
51 pg_off = dest_addr & ~PAGE_MASK;
52 bytes = min(len, (int)PAGE_SIZE - pg_off);
53
54 siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
55
56 dest = kmap_atomic(p);
57 rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
58 bytes);
59
60 if (unlikely(rv)) {
61 kunmap_atomic(dest);
62 srx->skb_copied += copied;
63 srx->skb_new -= copied;
64
65 pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
66 qp_id(rx_qp(srx)), __func__, len, p, rv);
67
68 return -EFAULT;
69 }
70 if (srx->mpa_crc_hd) {
71 if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
72 crypto_shash_update(srx->mpa_crc_hd,
73 (u8 *)(dest + pg_off), bytes);
74 kunmap_atomic(dest);
75 } else {
76 kunmap_atomic(dest);
77 /*
78 * Do CRC on original, not target buffer.
79 * Some user land applications may
80 * concurrently write the target buffer,
81 * which would yield a broken CRC.
82 * Walking the skb twice is very ineffcient.
83 * Folding the CRC into skb_copy_bits()
84 * would be much better, but is currently
85 * not supported.
86 */
87 siw_crc_skb(srx, bytes);
88 }
89 } else {
90 kunmap_atomic(dest);
91 }
92 srx->skb_offset += bytes;
93 copied += bytes;
94 len -= bytes;
95 dest_addr += bytes;
96 pg_off = 0;
97 }
98 srx->skb_copied += copied;
99 srx->skb_new -= copied;
100
101 return copied;
102 }
103
siw_rx_kva(struct siw_rx_stream * srx,void * kva,int len)104 static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
105 {
106 int rv;
107
108 siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
109
110 rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
111 if (unlikely(rv)) {
112 pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
113 qp_id(rx_qp(srx)), __func__, len, kva, rv);
114
115 return rv;
116 }
117 if (srx->mpa_crc_hd)
118 crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
119
120 srx->skb_offset += len;
121 srx->skb_copied += len;
122 srx->skb_new -= len;
123
124 return len;
125 }
126
/*
 * siw_rx_pbl()
 *
 * Place @len bytes into memory described by the physical buffer list of
 * @mem, starting at address @addr. The data is placed buffer by buffer
 * via siw_rx_kva(); @pbl_idx is handed to siw_pbl_get_buffer() for each
 * lookup.
 *
 * Returns the number of bytes placed. This is less than @len if no
 * buffer address could be obtained or if a copy came up short.
 */
static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	int placed = 0;

	while (len) {
		int chunk;
		dma_addr_t buf =
			siw_pbl_get_buffer(pbl, offset, &chunk, pbl_idx);

		if (!buf)
			break;

		/* do not place more than requested into this buffer */
		chunk = min(chunk, len);

		if (siw_rx_kva(srx, (void *)(uintptr_t)buf, chunk) != chunk)
			break;

		placed += chunk;
		offset += chunk;
		len -= chunk;
	}
	return placed;
}
153
154 /*
155 * siw_rresp_check_ntoh()
156 *
157 * Check incoming RRESP fragment header against expected
158 * header values and update expected values for potential next
159 * fragment.
160 *
161 * NOTE: This function must be called only if a RRESP DDP segment
162 * starts but not for fragmented consecutive pieces of an
163 * already started DDP segment.
164 */
siw_rresp_check_ntoh(struct siw_rx_stream * srx,struct siw_rx_fpdu * frx)165 static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
166 struct siw_rx_fpdu *frx)
167 {
168 struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
169 struct siw_wqe *wqe = &frx->wqe_active;
170 enum ddp_ecode ecode;
171
172 u32 sink_stag = be32_to_cpu(rresp->sink_stag);
173 u64 sink_to = be64_to_cpu(rresp->sink_to);
174
175 if (frx->first_ddp_seg) {
176 srx->ddp_stag = wqe->sqe.sge[0].lkey;
177 srx->ddp_to = wqe->sqe.sge[0].laddr;
178 frx->pbl_idx = 0;
179 }
180 /* Below checks extend beyond the semantics of DDP, and
181 * into RDMAP:
182 * We check if the read response matches exactly the
183 * read request which was send to the remote peer to
184 * trigger this read response. RFC5040/5041 do not
185 * always have a proper error code for the detected
186 * error cases. We choose 'base or bounds error' for
187 * cases where the inbound STag is valid, but offset
188 * or length do not match our response receive state.
189 */
190 if (unlikely(srx->ddp_stag != sink_stag)) {
191 pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
192 qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
193 ecode = DDP_ECODE_T_INVALID_STAG;
194 goto error;
195 }
196 if (unlikely(srx->ddp_to != sink_to)) {
197 pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
198 qp_id(rx_qp(srx)), (unsigned long long)sink_to,
199 (unsigned long long)srx->ddp_to);
200 ecode = DDP_ECODE_T_BASE_BOUNDS;
201 goto error;
202 }
203 if (unlikely(!frx->more_ddp_segs &&
204 (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
205 pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
206 qp_id(rx_qp(srx)),
207 wqe->processed + srx->fpdu_part_rem, wqe->bytes);
208 ecode = DDP_ECODE_T_BASE_BOUNDS;
209 goto error;
210 }
211 return 0;
212 error:
213 siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
214 DDP_ETYPE_TAGGED_BUF, ecode, 0);
215 return -EINVAL;
216 }
217
218 /*
219 * siw_write_check_ntoh()
220 *
221 * Check incoming WRITE fragment header against expected
222 * header values and update expected values for potential next
223 * fragment
224 *
225 * NOTE: This function must be called only if a WRITE DDP segment
226 * starts but not for fragmented consecutive pieces of an
227 * already started DDP segment.
228 */
siw_write_check_ntoh(struct siw_rx_stream * srx,struct siw_rx_fpdu * frx)229 static int siw_write_check_ntoh(struct siw_rx_stream *srx,
230 struct siw_rx_fpdu *frx)
231 {
232 struct iwarp_rdma_write *write = &srx->hdr.rwrite;
233 enum ddp_ecode ecode;
234
235 u32 sink_stag = be32_to_cpu(write->sink_stag);
236 u64 sink_to = be64_to_cpu(write->sink_to);
237
238 if (frx->first_ddp_seg) {
239 srx->ddp_stag = sink_stag;
240 srx->ddp_to = sink_to;
241 frx->pbl_idx = 0;
242 } else {
243 if (unlikely(srx->ddp_stag != sink_stag)) {
244 pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
245 qp_id(rx_qp(srx)), sink_stag,
246 srx->ddp_stag);
247 ecode = DDP_ECODE_T_INVALID_STAG;
248 goto error;
249 }
250 if (unlikely(srx->ddp_to != sink_to)) {
251 pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
252 qp_id(rx_qp(srx)),
253 (unsigned long long)sink_to,
254 (unsigned long long)srx->ddp_to);
255 ecode = DDP_ECODE_T_BASE_BOUNDS;
256 goto error;
257 }
258 }
259 return 0;
260 error:
261 siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
262 DDP_ETYPE_TAGGED_BUF, ecode, 0);
263 return -EINVAL;
264 }
265
266 /*
267 * siw_send_check_ntoh()
268 *
269 * Check incoming SEND fragment header against expected
270 * header values and update expected MSN if no next
271 * fragment expected
272 *
273 * NOTE: This function must be called only if a SEND DDP segment
274 * starts but not for fragmented consecutive pieces of an
275 * already started DDP segment.
276 */
siw_send_check_ntoh(struct siw_rx_stream * srx,struct siw_rx_fpdu * frx)277 static int siw_send_check_ntoh(struct siw_rx_stream *srx,
278 struct siw_rx_fpdu *frx)
279 {
280 struct iwarp_send_inv *send = &srx->hdr.send_inv;
281 struct siw_wqe *wqe = &frx->wqe_active;
282 enum ddp_ecode ecode;
283
284 u32 ddp_msn = be32_to_cpu(send->ddp_msn);
285 u32 ddp_mo = be32_to_cpu(send->ddp_mo);
286 u32 ddp_qn = be32_to_cpu(send->ddp_qn);
287
288 if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
289 pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
290 qp_id(rx_qp(srx)), ddp_qn);
291 ecode = DDP_ECODE_UT_INVALID_QN;
292 goto error;
293 }
294 if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
295 pr_warn("siw: [QP %u]: send msn: %u != %u\n",
296 qp_id(rx_qp(srx)), ddp_msn,
297 srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
298 ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
299 goto error;
300 }
301 if (unlikely(ddp_mo != wqe->processed)) {
302 pr_warn("siw: [QP %u], send mo: %u != %u\n",
303 qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
304 ecode = DDP_ECODE_UT_INVALID_MO;
305 goto error;
306 }
307 if (frx->first_ddp_seg) {
308 /* initialize user memory write position */
309 frx->sge_idx = 0;
310 frx->sge_off = 0;
311 frx->pbl_idx = 0;
312
313 /* only valid for SEND_INV and SEND_SE_INV operations */
314 srx->inval_stag = be32_to_cpu(send->inval_stag);
315 }
316 if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
317 siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
318 wqe->bytes, wqe->processed, srx->fpdu_part_rem);
319 wqe->wc_status = SIW_WC_LOC_LEN_ERR;
320 ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
321 goto error;
322 }
323 return 0;
324 error:
325 siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
326 DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
327 return -EINVAL;
328 }
329
siw_rqe_get(struct siw_qp * qp)330 static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
331 {
332 struct siw_rqe *rqe;
333 struct siw_srq *srq;
334 struct siw_wqe *wqe = NULL;
335 bool srq_event = false;
336 unsigned long flags;
337
338 srq = qp->srq;
339 if (srq) {
340 spin_lock_irqsave(&srq->lock, flags);
341 if (unlikely(!srq->num_rqe))
342 goto out;
343
344 rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
345 } else {
346 if (unlikely(!qp->recvq))
347 goto out;
348
349 rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
350 }
351 if (likely(rqe->flags == SIW_WQE_VALID)) {
352 int num_sge = rqe->num_sge;
353
354 if (likely(num_sge <= SIW_MAX_SGE)) {
355 int i = 0;
356
357 wqe = rx_wqe(&qp->rx_untagged);
358 rx_type(wqe) = SIW_OP_RECEIVE;
359 wqe->wr_status = SIW_WR_INPROGRESS;
360 wqe->bytes = 0;
361 wqe->processed = 0;
362
363 wqe->rqe.id = rqe->id;
364 wqe->rqe.num_sge = num_sge;
365
366 while (i < num_sge) {
367 wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
368 wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
369 wqe->rqe.sge[i].length = rqe->sge[i].length;
370 wqe->bytes += wqe->rqe.sge[i].length;
371 wqe->mem[i] = NULL;
372 i++;
373 }
374 /* can be re-used by appl */
375 smp_store_mb(rqe->flags, 0);
376 } else {
377 siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
378 if (srq)
379 spin_unlock_irqrestore(&srq->lock, flags);
380 return NULL;
381 }
382 if (!srq) {
383 qp->rq_get++;
384 } else {
385 if (srq->armed) {
386 /* Test SRQ limit */
387 u32 off = (srq->rq_get + srq->limit) %
388 srq->num_rqe;
389 struct siw_rqe *rqe2 = &srq->recvq[off];
390
391 if (!(rqe2->flags & SIW_WQE_VALID)) {
392 srq->armed = false;
393 srq_event = true;
394 }
395 }
396 srq->rq_get++;
397 }
398 }
399 out:
400 if (srq) {
401 spin_unlock_irqrestore(&srq->lock, flags);
402 if (srq_event)
403 siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
404 }
405 return wqe;
406 }
407
408 /*
409 * siw_proc_send:
410 *
411 * Process one incoming SEND and place data into memory referenced by
412 * receive wqe.
413 *
414 * Function supports partially received sends (suspending/resuming
415 * current receive wqe processing)
416 *
417 * return value:
418 * 0: reached the end of a DDP segment
419 * -EAGAIN: to be called again to finish the DDP segment
420 */
siw_proc_send(struct siw_qp * qp)421 int siw_proc_send(struct siw_qp *qp)
422 {
423 struct siw_rx_stream *srx = &qp->rx_stream;
424 struct siw_rx_fpdu *frx = &qp->rx_untagged;
425 struct siw_wqe *wqe;
426 u32 data_bytes; /* all data bytes available */
427 u32 rcvd_bytes; /* sum of data bytes rcvd */
428 int rv = 0;
429
430 if (frx->first_ddp_seg) {
431 wqe = siw_rqe_get(qp);
432 if (unlikely(!wqe)) {
433 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
434 DDP_ETYPE_UNTAGGED_BUF,
435 DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
436 return -ENOENT;
437 }
438 } else {
439 wqe = rx_wqe(frx);
440 }
441 if (srx->state == SIW_GET_DATA_START) {
442 rv = siw_send_check_ntoh(srx, frx);
443 if (unlikely(rv)) {
444 siw_qp_event(qp, IB_EVENT_QP_FATAL);
445 return rv;
446 }
447 if (!srx->fpdu_part_rem) /* zero length SEND */
448 return 0;
449 }
450 data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
451 rcvd_bytes = 0;
452
453 /* A zero length SEND will skip below loop */
454 while (data_bytes) {
455 struct ib_pd *pd;
456 struct siw_mem **mem, *mem_p;
457 struct siw_sge *sge;
458 u32 sge_bytes; /* data bytes avail for SGE */
459
460 sge = &wqe->rqe.sge[frx->sge_idx];
461
462 if (!sge->length) {
463 /* just skip empty sge's */
464 frx->sge_idx++;
465 frx->sge_off = 0;
466 frx->pbl_idx = 0;
467 continue;
468 }
469 sge_bytes = min(data_bytes, sge->length - frx->sge_off);
470 mem = &wqe->mem[frx->sge_idx];
471
472 /*
473 * check with QP's PD if no SRQ present, SRQ's PD otherwise
474 */
475 pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
476
477 rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
478 frx->sge_off, sge_bytes);
479 if (unlikely(rv)) {
480 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
481 DDP_ETYPE_CATASTROPHIC,
482 DDP_ECODE_CATASTROPHIC, 0);
483
484 siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
485 break;
486 }
487 mem_p = *mem;
488 if (mem_p->mem_obj == NULL)
489 rv = siw_rx_kva(srx,
490 (void *)(uintptr_t)(sge->laddr + frx->sge_off),
491 sge_bytes);
492 else if (!mem_p->is_pbl)
493 rv = siw_rx_umem(srx, mem_p->umem,
494 sge->laddr + frx->sge_off, sge_bytes);
495 else
496 rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
497 sge->laddr + frx->sge_off, sge_bytes);
498
499 if (unlikely(rv != sge_bytes)) {
500 wqe->processed += rcvd_bytes;
501
502 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
503 DDP_ETYPE_CATASTROPHIC,
504 DDP_ECODE_CATASTROPHIC, 0);
505 return -EINVAL;
506 }
507 frx->sge_off += rv;
508
509 if (frx->sge_off == sge->length) {
510 frx->sge_idx++;
511 frx->sge_off = 0;
512 frx->pbl_idx = 0;
513 }
514 data_bytes -= rv;
515 rcvd_bytes += rv;
516
517 srx->fpdu_part_rem -= rv;
518 srx->fpdu_part_rcvd += rv;
519 }
520 wqe->processed += rcvd_bytes;
521
522 if (!srx->fpdu_part_rem)
523 return 0;
524
525 return (rv < 0) ? rv : -EAGAIN;
526 }
527
528 /*
529 * siw_proc_write:
530 *
531 * Place incoming WRITE after referencing and checking target buffer
532
533 * Function supports partially received WRITEs (suspending/resuming
534 * current receive processing)
535 *
536 * return value:
537 * 0: reached the end of a DDP segment
538 * -EAGAIN: to be called again to finish the DDP segment
539 */
siw_proc_write(struct siw_qp * qp)540 int siw_proc_write(struct siw_qp *qp)
541 {
542 struct siw_rx_stream *srx = &qp->rx_stream;
543 struct siw_rx_fpdu *frx = &qp->rx_tagged;
544 struct siw_mem *mem;
545 int bytes, rv;
546
547 if (srx->state == SIW_GET_DATA_START) {
548 if (!srx->fpdu_part_rem) /* zero length WRITE */
549 return 0;
550
551 rv = siw_write_check_ntoh(srx, frx);
552 if (unlikely(rv)) {
553 siw_qp_event(qp, IB_EVENT_QP_FATAL);
554 return rv;
555 }
556 }
557 bytes = min(srx->fpdu_part_rem, srx->skb_new);
558
559 if (frx->first_ddp_seg) {
560 struct siw_wqe *wqe = rx_wqe(frx);
561
562 rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
563 if (unlikely(!rx_mem(frx))) {
564 siw_dbg_qp(qp,
565 "sink stag not found/invalid, stag 0x%08x\n",
566 srx->ddp_stag);
567
568 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
569 DDP_ETYPE_TAGGED_BUF,
570 DDP_ECODE_T_INVALID_STAG, 0);
571 return -EINVAL;
572 }
573 wqe->rqe.num_sge = 1;
574 rx_type(wqe) = SIW_OP_WRITE;
575 wqe->wr_status = SIW_WR_INPROGRESS;
576 }
577 mem = rx_mem(frx);
578
579 /*
580 * Check if application re-registered memory with different
581 * key field of STag.
582 */
583 if (unlikely(mem->stag != srx->ddp_stag)) {
584 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
585 DDP_ETYPE_TAGGED_BUF,
586 DDP_ECODE_T_INVALID_STAG, 0);
587 return -EINVAL;
588 }
589 rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
590 IB_ACCESS_REMOTE_WRITE, bytes);
591 if (unlikely(rv)) {
592 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
593 DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
594 0);
595
596 siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
597
598 return -EINVAL;
599 }
600
601 if (mem->mem_obj == NULL)
602 rv = siw_rx_kva(srx,
603 (void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
604 bytes);
605 else if (!mem->is_pbl)
606 rv = siw_rx_umem(srx, mem->umem,
607 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
608 else
609 rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
610 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
611
612 if (unlikely(rv != bytes)) {
613 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
614 DDP_ETYPE_CATASTROPHIC,
615 DDP_ECODE_CATASTROPHIC, 0);
616 return -EINVAL;
617 }
618 srx->fpdu_part_rem -= rv;
619 srx->fpdu_part_rcvd += rv;
620
621 if (!srx->fpdu_part_rem) {
622 srx->ddp_to += srx->fpdu_part_rcvd;
623 return 0;
624 }
625 return -EAGAIN;
626 }
627
628 /*
629 * Inbound RREQ's cannot carry user data.
630 */
siw_proc_rreq(struct siw_qp * qp)631 int siw_proc_rreq(struct siw_qp *qp)
632 {
633 struct siw_rx_stream *srx = &qp->rx_stream;
634
635 if (!srx->fpdu_part_rem)
636 return 0;
637
638 pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
639 be16_to_cpu(srx->hdr.ctrl.mpa_len));
640
641 return -EPROTO;
642 }
643
644 /*
645 * siw_init_rresp:
646 *
647 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
648 * Put it at the tail of the IRQ, if there is another WQE currently in
649 * transmit processing. If not, make it the current WQE to be processed
650 * and schedule transmit processing.
651 *
652 * Can be called from softirq context and from process
653 * context (RREAD socket loopback case!)
654 *
655 * return value:
656 * 0: success,
657 * failure code otherwise
658 */
659
siw_init_rresp(struct siw_qp * qp,struct siw_rx_stream * srx)660 static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
661 {
662 struct siw_wqe *tx_work = tx_wqe(qp);
663 struct siw_sqe *resp;
664
665 uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
666 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
667 uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
668 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
669 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
670 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
671
672 int run_sq = 1, rv = 0;
673 unsigned long flags;
674
675 if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
676 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
677 DDP_ETYPE_UNTAGGED_BUF,
678 DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
679 return -EPROTO;
680 }
681 spin_lock_irqsave(&qp->sq_lock, flags);
682
683 if (unlikely(!qp->attrs.irq_size)) {
684 run_sq = 0;
685 goto error_irq;
686 }
687 if (tx_work->wr_status == SIW_WR_IDLE) {
688 /*
689 * immediately schedule READ response w/o
690 * consuming IRQ entry: IRQ must be empty.
691 */
692 tx_work->processed = 0;
693 tx_work->mem[0] = NULL;
694 tx_work->wr_status = SIW_WR_QUEUED;
695 resp = &tx_work->sqe;
696 } else {
697 resp = irq_alloc_free(qp);
698 run_sq = 0;
699 }
700 if (likely(resp)) {
701 resp->opcode = SIW_OP_READ_RESPONSE;
702
703 resp->sge[0].length = length;
704 resp->sge[0].laddr = laddr;
705 resp->sge[0].lkey = lkey;
706
707 /* Keep aside message sequence number for potential
708 * error reporting during Read Response generation.
709 */
710 resp->sge[1].length = msn;
711
712 resp->raddr = raddr;
713 resp->rkey = rkey;
714 resp->num_sge = length ? 1 : 0;
715
716 /* RRESP now valid as current TX wqe or placed into IRQ */
717 smp_store_mb(resp->flags, SIW_WQE_VALID);
718 } else {
719 error_irq:
720 pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
721 qp_id(qp), qp->attrs.irq_size);
722
723 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
724 RDMAP_ETYPE_REMOTE_OPERATION,
725 RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
726 rv = -EPROTO;
727 }
728
729 spin_unlock_irqrestore(&qp->sq_lock, flags);
730
731 if (run_sq)
732 rv = siw_sq_start(qp);
733
734 return rv;
735 }
736
737 /*
738 * Only called at start of Read.Resonse processing.
739 * Transfer pending Read from tip of ORQ into currrent rx wqe,
740 * but keep ORQ entry valid until Read.Response processing done.
741 * No Queue locking needed.
742 */
siw_orqe_start_rx(struct siw_qp * qp)743 static int siw_orqe_start_rx(struct siw_qp *qp)
744 {
745 struct siw_sqe *orqe;
746 struct siw_wqe *wqe = NULL;
747
748 if (unlikely(!qp->attrs.orq_size))
749 return -EPROTO;
750
751 /* make sure ORQ indices are current */
752 smp_mb();
753
754 orqe = orq_get_current(qp);
755 if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
756 /* RRESP is a TAGGED RDMAP operation */
757 wqe = rx_wqe(&qp->rx_tagged);
758 wqe->sqe.id = orqe->id;
759 wqe->sqe.opcode = orqe->opcode;
760 wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
761 wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
762 wqe->sqe.sge[0].length = orqe->sge[0].length;
763 wqe->sqe.flags = orqe->flags;
764 wqe->sqe.num_sge = 1;
765 wqe->bytes = orqe->sge[0].length;
766 wqe->processed = 0;
767 wqe->mem[0] = NULL;
768 /* make sure WQE is completely written before valid */
769 smp_wmb();
770 wqe->wr_status = SIW_WR_INPROGRESS;
771
772 return 0;
773 }
774 return -EPROTO;
775 }
776
777 /*
778 * siw_proc_rresp:
779 *
780 * Place incoming RRESP data into memory referenced by RREQ WQE
781 * which is at the tip of the ORQ
782 *
783 * Function supports partially received RRESP's (suspending/resuming
784 * current receive processing)
785 */
siw_proc_rresp(struct siw_qp * qp)786 int siw_proc_rresp(struct siw_qp *qp)
787 {
788 struct siw_rx_stream *srx = &qp->rx_stream;
789 struct siw_rx_fpdu *frx = &qp->rx_tagged;
790 struct siw_wqe *wqe = rx_wqe(frx);
791 struct siw_mem **mem, *mem_p;
792 struct siw_sge *sge;
793 int bytes, rv;
794
795 if (frx->first_ddp_seg) {
796 if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
797 pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
798 qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
799 rv = -EPROTO;
800 goto error_term;
801 }
802 /*
803 * fetch pending RREQ from orq
804 */
805 rv = siw_orqe_start_rx(qp);
806 if (rv) {
807 pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
808 qp_id(qp), qp->attrs.orq_size);
809 goto error_term;
810 }
811 rv = siw_rresp_check_ntoh(srx, frx);
812 if (unlikely(rv)) {
813 siw_qp_event(qp, IB_EVENT_QP_FATAL);
814 return rv;
815 }
816 } else {
817 if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
818 pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
819 qp_id(qp), wqe->wr_status);
820 rv = -EPROTO;
821 goto error_term;
822 }
823 }
824 if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
825 return 0;
826
827 sge = wqe->sqe.sge; /* there is only one */
828 mem = &wqe->mem[0];
829
830 if (!(*mem)) {
831 /*
832 * check target memory which resolves memory on first fragment
833 */
834 rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
835 wqe->bytes);
836 if (unlikely(rv)) {
837 siw_dbg_qp(qp, "target mem check: %d\n", rv);
838 wqe->wc_status = SIW_WC_LOC_PROT_ERR;
839
840 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
841 DDP_ETYPE_TAGGED_BUF,
842 siw_tagged_error(-rv), 0);
843
844 siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
845
846 return -EINVAL;
847 }
848 }
849 mem_p = *mem;
850
851 bytes = min(srx->fpdu_part_rem, srx->skb_new);
852
853 if (mem_p->mem_obj == NULL)
854 rv = siw_rx_kva(srx,
855 (void *)(uintptr_t)(sge->laddr + wqe->processed),
856 bytes);
857 else if (!mem_p->is_pbl)
858 rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
859 bytes);
860 else
861 rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
862 sge->laddr + wqe->processed, bytes);
863 if (rv != bytes) {
864 wqe->wc_status = SIW_WC_GENERAL_ERR;
865 rv = -EINVAL;
866 goto error_term;
867 }
868 srx->fpdu_part_rem -= rv;
869 srx->fpdu_part_rcvd += rv;
870 wqe->processed += rv;
871
872 if (!srx->fpdu_part_rem) {
873 srx->ddp_to += srx->fpdu_part_rcvd;
874 return 0;
875 }
876 return -EAGAIN;
877
878 error_term:
879 siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
880 DDP_ECODE_CATASTROPHIC, 0);
881 return rv;
882 }
883
siw_proc_terminate(struct siw_qp * qp)884 int siw_proc_terminate(struct siw_qp *qp)
885 {
886 struct siw_rx_stream *srx = &qp->rx_stream;
887 struct sk_buff *skb = srx->skb;
888 struct iwarp_terminate *term = &srx->hdr.terminate;
889 union iwarp_hdr term_info;
890 u8 *infop = (u8 *)&term_info;
891 enum rdma_opcode op;
892 u16 to_copy = sizeof(struct iwarp_ctrl);
893
894 pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
895 __rdmap_term_layer(term), __rdmap_term_etype(term),
896 __rdmap_term_ecode(term));
897
898 if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
899 be32_to_cpu(term->ddp_msn) !=
900 qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
901 be32_to_cpu(term->ddp_mo) != 0) {
902 pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
903 be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
904 be32_to_cpu(term->ddp_mo));
905 return -ECONNRESET;
906 }
907 /*
908 * Receive remaining pieces of TERM if indicated
909 */
910 if (!term->flag_m)
911 return -ECONNRESET;
912
913 /* Do not take the effort to reassemble a network fragmented
914 * TERM message
915 */
916 if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
917 return -ECONNRESET;
918
919 memset(infop, 0, sizeof(term_info));
920
921 skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
922
923 op = __rdmap_get_opcode(&term_info.ctrl);
924 if (op >= RDMAP_TERMINATE)
925 goto out;
926
927 infop += to_copy;
928 srx->skb_offset += to_copy;
929 srx->skb_new -= to_copy;
930 srx->skb_copied += to_copy;
931 srx->fpdu_part_rcvd += to_copy;
932 srx->fpdu_part_rem -= to_copy;
933
934 to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
935
936 /* Again, no network fragmented TERM's */
937 if (to_copy + MPA_CRC_SIZE > srx->skb_new)
938 return -ECONNRESET;
939
940 skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
941
942 if (term->flag_r) {
943 siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
944 op, be16_to_cpu(term_info.ctrl.mpa_len),
945 term->flag_m ? "valid" : "invalid");
946 } else if (term->flag_d) {
947 siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
948 op, be16_to_cpu(term_info.ctrl.mpa_len),
949 term->flag_m ? "valid" : "invalid");
950 }
951 out:
952 srx->skb_new -= to_copy;
953 srx->skb_offset += to_copy;
954 srx->skb_copied += to_copy;
955 srx->fpdu_part_rcvd += to_copy;
956 srx->fpdu_part_rem -= to_copy;
957
958 return -ECONNRESET;
959 }
960
siw_get_trailer(struct siw_qp * qp,struct siw_rx_stream * srx)961 static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
962 {
963 struct sk_buff *skb = srx->skb;
964 int avail = min(srx->skb_new, srx->fpdu_part_rem);
965 u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
966 __wsum crc_in, crc_own = 0;
967
968 siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
969 srx->fpdu_part_rem, srx->skb_new, srx->pad);
970
971 skb_copy_bits(skb, srx->skb_offset, tbuf, avail);
972
973 srx->skb_new -= avail;
974 srx->skb_offset += avail;
975 srx->skb_copied += avail;
976 srx->fpdu_part_rem -= avail;
977
978 if (srx->fpdu_part_rem)
979 return -EAGAIN;
980
981 if (!srx->mpa_crc_hd)
982 return 0;
983
984 if (srx->pad)
985 crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
986 /*
987 * CRC32 is computed, transmitted and received directly in NBO,
988 * so there's never a reason to convert byte order.
989 */
990 crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
991 crc_in = (__force __wsum)srx->trailer.crc;
992
993 if (unlikely(crc_in != crc_own)) {
994 pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
995 crc_in, crc_own, qp->rx_stream.rdmap_op);
996
997 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
998 LLP_ETYPE_MPA,
999 LLP_ECODE_RECEIVED_CRC, 0);
1000 return -EINVAL;
1001 }
1002 return 0;
1003 }
1004
1005 #define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
1006
siw_get_hdr(struct siw_rx_stream * srx)1007 static int siw_get_hdr(struct siw_rx_stream *srx)
1008 {
1009 struct sk_buff *skb = srx->skb;
1010 struct siw_qp *qp = rx_qp(srx);
1011 struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1012 struct siw_rx_fpdu *frx;
1013 u8 opcode;
1014 int bytes;
1015
1016 if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1017 /*
1018 * copy a mimimum sized (tagged) DDP frame control part
1019 */
1020 bytes = min_t(int, srx->skb_new,
1021 MIN_DDP_HDR - srx->fpdu_part_rcvd);
1022
1023 skb_copy_bits(skb, srx->skb_offset,
1024 (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1025
1026 srx->fpdu_part_rcvd += bytes;
1027
1028 srx->skb_new -= bytes;
1029 srx->skb_offset += bytes;
1030 srx->skb_copied += bytes;
1031
1032 if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1033 return -EAGAIN;
1034
1035 if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1036 enum ddp_etype etype;
1037 enum ddp_ecode ecode;
1038
1039 pr_warn("siw: received ddp version unsupported %d\n",
1040 __ddp_get_version(c_hdr));
1041
1042 if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1043 etype = DDP_ETYPE_TAGGED_BUF;
1044 ecode = DDP_ECODE_T_VERSION;
1045 } else {
1046 etype = DDP_ETYPE_UNTAGGED_BUF;
1047 ecode = DDP_ECODE_UT_VERSION;
1048 }
1049 siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1050 etype, ecode, 0);
1051 return -EINVAL;
1052 }
1053 if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1054 pr_warn("siw: received rdmap version unsupported %d\n",
1055 __rdmap_get_version(c_hdr));
1056
1057 siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1058 RDMAP_ETYPE_REMOTE_OPERATION,
1059 RDMAP_ECODE_VERSION, 0);
1060 return -EINVAL;
1061 }
1062 opcode = __rdmap_get_opcode(c_hdr);
1063
1064 if (opcode > RDMAP_TERMINATE) {
1065 pr_warn("siw: received unknown packet type %u\n",
1066 opcode);
1067
1068 siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1069 RDMAP_ETYPE_REMOTE_OPERATION,
1070 RDMAP_ECODE_OPCODE, 0);
1071 return -EINVAL;
1072 }
1073 siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1074 } else {
1075 opcode = __rdmap_get_opcode(c_hdr);
1076 }
1077 set_rx_fpdu_context(qp, opcode);
1078 frx = qp->rx_fpdu;
1079
1080 /*
1081 * Figure out len of current hdr: variable length of
1082 * iwarp hdr may force us to copy hdr information in
1083 * two steps. Only tagged DDP messages are already
1084 * completely received.
1085 */
1086 if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1087 int hdrlen = iwarp_pktinfo[opcode].hdr_len;
1088
1089 bytes = min_t(int, hdrlen - MIN_DDP_HDR, srx->skb_new);
1090
1091 skb_copy_bits(skb, srx->skb_offset,
1092 (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1093
1094 srx->fpdu_part_rcvd += bytes;
1095
1096 srx->skb_new -= bytes;
1097 srx->skb_offset += bytes;
1098 srx->skb_copied += bytes;
1099
1100 if (srx->fpdu_part_rcvd < hdrlen)
1101 return -EAGAIN;
1102 }
1103
1104 /*
1105 * DDP/RDMAP header receive completed. Check if the current
1106 * DDP segment starts a new RDMAP message or continues a previously
1107 * started RDMAP message.
1108 *
1109 * Alternating reception of DDP segments (or FPDUs) from incomplete
1110 * tagged and untagged RDMAP messages is supported, as long as
1111 * the current tagged or untagged message gets eventually completed
1112 * w/o intersection from another message of the same type
1113 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1114 * but not by a READ RESPONSE etc.
1115 */
1116 if (srx->mpa_crc_hd) {
1117 /*
1118 * Restart CRC computation
1119 */
1120 crypto_shash_init(srx->mpa_crc_hd);
1121 crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1122 srx->fpdu_part_rcvd);
1123 }
1124 if (frx->more_ddp_segs) {
1125 frx->first_ddp_seg = 0;
1126 if (frx->prev_rdmap_op != opcode) {
1127 pr_warn("siw: packet intersection: %u : %u\n",
1128 frx->prev_rdmap_op, opcode);
1129 /*
1130 * The last inbound RDMA operation of same type
1131 * (tagged or untagged) is left unfinished.
1132 * To complete it in error, make it the current
1133 * operation again, even with the header already
1134 * overwritten. For error handling, only the opcode
1135 * and current rx context are relevant.
1136 */
1137 set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1138 __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1139 return -EPROTO;
1140 }
1141 } else {
1142 frx->prev_rdmap_op = opcode;
1143 frx->first_ddp_seg = 1;
1144 }
1145 frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1146
1147 return 0;
1148 }
1149
siw_check_tx_fence(struct siw_qp * qp)1150 static int siw_check_tx_fence(struct siw_qp *qp)
1151 {
1152 struct siw_wqe *tx_waiting = tx_wqe(qp);
1153 struct siw_sqe *rreq;
1154 int resume_tx = 0, rv = 0;
1155 unsigned long flags;
1156
1157 spin_lock_irqsave(&qp->orq_lock, flags);
1158
1159 /* free current orq entry */
1160 rreq = orq_get_current(qp);
1161 WRITE_ONCE(rreq->flags, 0);
1162
1163 qp->orq_get++;
1164
1165 if (qp->tx_ctx.orq_fence) {
1166 if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1167 pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1168 qp_id(qp), tx_waiting->wr_status);
1169 rv = -EPROTO;
1170 goto out;
1171 }
1172 /* resume SQ processing, if possible */
1173 if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1174 tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1175
1176 /* SQ processing was stopped because of a full ORQ */
1177 rreq = orq_get_free(qp);
1178 if (unlikely(!rreq)) {
1179 pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1180 rv = -EPROTO;
1181 goto out;
1182 }
1183 siw_read_to_orq(rreq, &tx_waiting->sqe);
1184
1185 qp->orq_put++;
1186 qp->tx_ctx.orq_fence = 0;
1187 resume_tx = 1;
1188
1189 } else if (siw_orq_empty(qp)) {
1190 /*
1191 * SQ processing was stopped by fenced work request.
1192 * Resume since all previous Read's are now completed.
1193 */
1194 qp->tx_ctx.orq_fence = 0;
1195 resume_tx = 1;
1196 }
1197 }
1198 out:
1199 spin_unlock_irqrestore(&qp->orq_lock, flags);
1200
1201 if (resume_tx)
1202 rv = siw_sq_start(qp);
1203
1204 return rv;
1205 }
1206
1207 /*
1208 * siw_rdmap_complete()
1209 *
1210 * Complete processing of an RDMA message after receiving all
1211 * DDP segmens or ABort processing after encountering error case.
1212 *
1213 * o SENDs + RRESPs will need for completion,
1214 * o RREQs need for READ RESPONSE initialization
1215 * o WRITEs need memory dereferencing
1216 *
1217 * TODO: Failed WRITEs need local error to be surfaced.
1218 */
siw_rdmap_complete(struct siw_qp * qp,int error)1219 static int siw_rdmap_complete(struct siw_qp *qp, int error)
1220 {
1221 struct siw_rx_stream *srx = &qp->rx_stream;
1222 struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1223 enum siw_wc_status wc_status = wqe->wc_status;
1224 u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1225 int rv = 0;
1226
1227 switch (opcode) {
1228 case RDMAP_SEND_SE:
1229 case RDMAP_SEND_SE_INVAL:
1230 wqe->rqe.flags |= SIW_WQE_SOLICITED;
1231 fallthrough;
1232
1233 case RDMAP_SEND:
1234 case RDMAP_SEND_INVAL:
1235 if (wqe->wr_status == SIW_WR_IDLE)
1236 break;
1237
1238 srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1239
1240 if (error != 0 && wc_status == SIW_WC_SUCCESS)
1241 wc_status = SIW_WC_GENERAL_ERR;
1242 /*
1243 * Handle STag invalidation request
1244 */
1245 if (wc_status == SIW_WC_SUCCESS &&
1246 (opcode == RDMAP_SEND_INVAL ||
1247 opcode == RDMAP_SEND_SE_INVAL)) {
1248 rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1249 if (rv) {
1250 siw_init_terminate(
1251 qp, TERM_ERROR_LAYER_RDMAP,
1252 rv == -EACCES ?
1253 RDMAP_ETYPE_REMOTE_PROTECTION :
1254 RDMAP_ETYPE_REMOTE_OPERATION,
1255 RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1256
1257 wc_status = SIW_WC_REM_INV_REQ_ERR;
1258 }
1259 rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1260 rv ? 0 : srx->inval_stag,
1261 wc_status);
1262 } else {
1263 rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1264 0, wc_status);
1265 }
1266 siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1267 break;
1268
1269 case RDMAP_RDMA_READ_RESP:
1270 if (wqe->wr_status == SIW_WR_IDLE)
1271 break;
1272
1273 if (error != 0) {
1274 if ((srx->state == SIW_GET_HDR &&
1275 qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1276 /* possible RREQ in ORQ left untouched */
1277 break;
1278
1279 if (wc_status == SIW_WC_SUCCESS)
1280 wc_status = SIW_WC_GENERAL_ERR;
1281 } else if (rdma_is_kernel_res(&qp->base_qp.res) &&
1282 rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1283 /*
1284 * Handle any STag invalidation request
1285 */
1286 rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1287 if (rv) {
1288 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1289 RDMAP_ETYPE_CATASTROPHIC,
1290 RDMAP_ECODE_UNSPECIFIED, 0);
1291
1292 if (wc_status == SIW_WC_SUCCESS) {
1293 wc_status = SIW_WC_GENERAL_ERR;
1294 error = rv;
1295 }
1296 }
1297 }
1298 /*
1299 * All errors turn the wqe into signalled.
1300 */
1301 if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1302 rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1303 wc_status);
1304 siw_wqe_put_mem(wqe, SIW_OP_READ);
1305
1306 if (!error) {
1307 rv = siw_check_tx_fence(qp);
1308 } else {
1309 /* Disable current ORQ element */
1310 if (qp->attrs.orq_size)
1311 WRITE_ONCE(orq_get_current(qp)->flags, 0);
1312 }
1313 break;
1314
1315 case RDMAP_RDMA_READ_REQ:
1316 if (!error) {
1317 rv = siw_init_rresp(qp, srx);
1318 srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1319 }
1320 break;
1321
1322 case RDMAP_RDMA_WRITE:
1323 if (wqe->wr_status == SIW_WR_IDLE)
1324 break;
1325
1326 /*
1327 * Free References from memory object if
1328 * attached to receive context (inbound WRITE).
1329 * While a zero-length WRITE is allowed,
1330 * no memory reference got created.
1331 */
1332 if (rx_mem(&qp->rx_tagged)) {
1333 siw_mem_put(rx_mem(&qp->rx_tagged));
1334 rx_mem(&qp->rx_tagged) = NULL;
1335 }
1336 break;
1337
1338 default:
1339 break;
1340 }
1341 wqe->wr_status = SIW_WR_IDLE;
1342
1343 return rv;
1344 }
1345
1346 /*
1347 * siw_tcp_rx_data()
1348 *
1349 * Main routine to consume inbound TCP payload
1350 *
1351 * @rd_desc: read descriptor
1352 * @skb: socket buffer
1353 * @off: offset in skb
1354 * @len: skb->len - offset : payload in skb
1355 */
siw_tcp_rx_data(read_descriptor_t * rd_desc,struct sk_buff * skb,unsigned int off,size_t len)1356 int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1357 unsigned int off, size_t len)
1358 {
1359 struct siw_qp *qp = rd_desc->arg.data;
1360 struct siw_rx_stream *srx = &qp->rx_stream;
1361 int rv;
1362
1363 srx->skb = skb;
1364 srx->skb_new = skb->len - off;
1365 srx->skb_offset = off;
1366 srx->skb_copied = 0;
1367
1368 siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1369
1370 while (srx->skb_new) {
1371 int run_completion = 1;
1372
1373 if (unlikely(srx->rx_suspend)) {
1374 /* Do not process any more data */
1375 srx->skb_copied += srx->skb_new;
1376 break;
1377 }
1378 switch (srx->state) {
1379 case SIW_GET_HDR:
1380 rv = siw_get_hdr(srx);
1381 if (!rv) {
1382 srx->fpdu_part_rem =
1383 be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1384 srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1385
1386 if (srx->fpdu_part_rem)
1387 srx->pad = -srx->fpdu_part_rem & 0x3;
1388 else
1389 srx->pad = 0;
1390
1391 srx->state = SIW_GET_DATA_START;
1392 srx->fpdu_part_rcvd = 0;
1393 }
1394 break;
1395
1396 case SIW_GET_DATA_MORE:
1397 /*
1398 * Another data fragment of the same DDP segment.
1399 * Setting first_ddp_seg = 0 avoids repeating
1400 * initializations that shall occur only once per
1401 * DDP segment.
1402 */
1403 qp->rx_fpdu->first_ddp_seg = 0;
1404 fallthrough;
1405
1406 case SIW_GET_DATA_START:
1407 /*
1408 * Headers will be checked by the opcode-specific
1409 * data receive function below.
1410 */
1411 rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1412 if (!rv) {
1413 int mpa_len =
1414 be16_to_cpu(srx->hdr.ctrl.mpa_len)
1415 + MPA_HDR_SIZE;
1416
1417 srx->fpdu_part_rem = (-mpa_len & 0x3)
1418 + MPA_CRC_SIZE;
1419 srx->fpdu_part_rcvd = 0;
1420 srx->state = SIW_GET_TRAILER;
1421 } else {
1422 if (unlikely(rv == -ECONNRESET))
1423 run_completion = 0;
1424 else
1425 srx->state = SIW_GET_DATA_MORE;
1426 }
1427 break;
1428
1429 case SIW_GET_TRAILER:
1430 /*
1431 * read CRC + any padding
1432 */
1433 rv = siw_get_trailer(qp, srx);
1434 if (likely(!rv)) {
1435 /*
1436 * FPDU completed.
1437 * complete RDMAP message if last fragment
1438 */
1439 srx->state = SIW_GET_HDR;
1440 srx->fpdu_part_rcvd = 0;
1441
1442 if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1443 DDP_FLAG_LAST))
1444 /* more frags */
1445 break;
1446
1447 rv = siw_rdmap_complete(qp, 0);
1448 run_completion = 0;
1449 }
1450 break;
1451
1452 default:
1453 pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1454 rv = -EPROTO;
1455 run_completion = 0;
1456 }
1457 if (unlikely(rv != 0 && rv != -EAGAIN)) {
1458 if ((srx->state > SIW_GET_HDR ||
1459 qp->rx_fpdu->more_ddp_segs) && run_completion)
1460 siw_rdmap_complete(qp, rv);
1461
1462 siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1463 srx->state);
1464
1465 siw_qp_cm_drop(qp, 1);
1466
1467 break;
1468 }
1469 if (rv) {
1470 siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1471 srx->state, srx->fpdu_part_rem);
1472 break;
1473 }
1474 }
1475 return srx->skb_copied;
1476 }
1477