1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 /*
3  * Copyright(c) 2015 - 2018 Intel Corporation.
4  */
5 
6 #include <linux/io.h>
7 #include <rdma/rdma_vt.h>
8 #include <rdma/rdmavt_qp.h>
9 
10 #include "hfi.h"
11 #include "qp.h"
12 #include "rc.h"
13 #include "verbs_txreq.h"
14 #include "trace.h"
15 
/**
 * find_prev_entry - search the ack queue backwards for the entry covering a PSN
 * @qp: the QP whose s_ack_queue is searched
 * @psn: the PSN to look for
 * @prev: if not NULL, receives the index of the last slot examined
 * @prev_ack: if not NULL, receives the index the walk was standing on when it
 *            stopped, i.e. the slot that follows *@prev in queue order
 * @scheduled: if not NULL, receives the "scheduled" verdict described below
 *
 * The walk starts at qp->r_head_ack_queue and steps to the preceding slot on
 * every iteration; stepping back from slot 0 wraps to the value returned by
 * rvt_size_atomic().  It stops when
 *   - the preceding slot is r_head_ack_queue again (whole ring visited),
 *   - the preceding slot has a zero opcode, or
 *   - cmp_psn(@psn, entry->psn) >= 0 for the preceding slot (a match).
 *
 * The scheduled verdict starts out true and is cleared when the walk stands
 * on qp->s_tail_ack_queue at the top of any iteration, or when the matching
 * slot is s_tail_ack_queue and cmp_psn(@psn, entry->lpsn) <= 0.
 *
 * The caller must hold qp->s_lock.
 *
 * Return: the matching ack queue entry, or NULL if the walk ended without a
 * match.
 */
struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
				      u8 *prev_ack, bool *scheduled)
	__must_hold(&qp->s_lock)
{
	struct rvt_ack_entry *found = NULL;
	u8 slot = qp->r_head_ack_queue;
	u8 older;
	bool sched = true;

	for (;;) {
		struct rvt_ack_entry *cand;

		/* Standing on the tail means the walk has reached it. */
		if (slot == qp->s_tail_ack_queue)
			sched = false;

		/* Step to the preceding slot, wrapping below index 0. */
		if (!slot)
			older = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
		else
			older = slot - 1;

		/* Back at the head: every slot has been visited. */
		if (older == qp->r_head_ack_queue)
			break;

		cand = &qp->s_ack_queue[older];
		/* A zero opcode ends the search without a match. */
		if (!cand->opcode)
			break;

		if (cmp_psn(psn, cand->psn) >= 0) {
			if (older == qp->s_tail_ack_queue &&
			    cmp_psn(psn, cand->lpsn) <= 0)
				sched = false;
			found = cand;
			break;
		}

		slot = older;
	}

	if (prev)
		*prev = older;
	if (prev_ack)
		*prev_ack = slot;
	if (scheduled)
		*scheduled = sched;

	return found;
}
55 
56 /**
57  * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
58  * @dev: the device for this QP
59  * @qp: a pointer to the QP
60  * @ohdr: a pointer to the IB header being constructed
61  * @ps: the xmit packet state
62  *
63  * Return 1 if constructed; otherwise, return 0.
64  * Note that we are in the responder's side of the QP context.
65  * Note the QP s_lock must be held.
66  */
make_rc_ack(struct hfi1_ibdev * dev,struct rvt_qp * qp,struct ib_other_headers * ohdr,struct hfi1_pkt_state * ps)67 static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
68 		       struct ib_other_headers *ohdr,
69 		       struct hfi1_pkt_state *ps)
70 {
71 	struct rvt_ack_entry *e;
72 	u32 hwords, hdrlen;
73 	u32 len = 0;
74 	u32 bth0 = 0, bth2 = 0;
75 	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
76 	int middle = 0;
77 	u32 pmtu = qp->pmtu;
78 	struct hfi1_qp_priv *qpriv = qp->priv;
79 	bool last_pkt;
80 	u32 delta;
81 	u8 next = qp->s_tail_ack_queue;
82 	struct tid_rdma_request *req;
83 
84 	trace_hfi1_rsp_make_rc_ack(qp, 0);
85 	lockdep_assert_held(&qp->s_lock);
86 	/* Don't send an ACK if we aren't supposed to. */
87 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
88 		goto bail;
89 
90 	if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
91 		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
92 		hwords = 5;
93 	else
94 		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
95 		hwords = 7;
96 
97 	switch (qp->s_ack_state) {
98 	case OP(RDMA_READ_RESPONSE_LAST):
99 	case OP(RDMA_READ_RESPONSE_ONLY):
100 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
101 		release_rdma_sge_mr(e);
102 		fallthrough;
103 	case OP(ATOMIC_ACKNOWLEDGE):
104 		/*
105 		 * We can increment the tail pointer now that the last
106 		 * response has been sent instead of only being
107 		 * constructed.
108 		 */
109 		if (++next > rvt_size_atomic(&dev->rdi))
110 			next = 0;
111 		/*
112 		 * Only advance the s_acked_ack_queue pointer if there
113 		 * have been no TID RDMA requests.
114 		 */
115 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
116 		if (e->opcode != TID_OP(WRITE_REQ) &&
117 		    qp->s_acked_ack_queue == qp->s_tail_ack_queue)
118 			qp->s_acked_ack_queue = next;
119 		qp->s_tail_ack_queue = next;
120 		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
121 		fallthrough;
122 	case OP(SEND_ONLY):
123 	case OP(ACKNOWLEDGE):
124 		/* Check for no next entry in the queue. */
125 		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
126 			if (qp->s_flags & RVT_S_ACK_PENDING)
127 				goto normal;
128 			goto bail;
129 		}
130 
131 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
132 		/* Check for tid write fence */
133 		if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
134 		    hfi1_tid_rdma_ack_interlock(qp, e)) {
135 			iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
136 			goto bail;
137 		}
138 		if (e->opcode == OP(RDMA_READ_REQUEST)) {
139 			/*
140 			 * If a RDMA read response is being resent and
141 			 * we haven't seen the duplicate request yet,
142 			 * then stop sending the remaining responses the
143 			 * responder has seen until the requester re-sends it.
144 			 */
145 			len = e->rdma_sge.sge_length;
146 			if (len && !e->rdma_sge.mr) {
147 				if (qp->s_acked_ack_queue ==
148 				    qp->s_tail_ack_queue)
149 					qp->s_acked_ack_queue =
150 						qp->r_head_ack_queue;
151 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
152 				goto bail;
153 			}
154 			/* Copy SGE state in case we need to resend */
155 			ps->s_txreq->mr = e->rdma_sge.mr;
156 			if (ps->s_txreq->mr)
157 				rvt_get_mr(ps->s_txreq->mr);
158 			qp->s_ack_rdma_sge.sge = e->rdma_sge;
159 			qp->s_ack_rdma_sge.num_sge = 1;
160 			ps->s_txreq->ss = &qp->s_ack_rdma_sge;
161 			if (len > pmtu) {
162 				len = pmtu;
163 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
164 			} else {
165 				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
166 				e->sent = 1;
167 			}
168 			ohdr->u.aeth = rvt_compute_aeth(qp);
169 			hwords++;
170 			qp->s_ack_rdma_psn = e->psn;
171 			bth2 = mask_psn(qp->s_ack_rdma_psn++);
172 		} else if (e->opcode == TID_OP(WRITE_REQ)) {
173 			/*
174 			 * If a TID RDMA WRITE RESP is being resent, we have to
175 			 * wait for the actual request. All requests that are to
176 			 * be resent will have their state set to
177 			 * TID_REQUEST_RESEND. When the new request arrives, the
178 			 * state will be changed to TID_REQUEST_RESEND_ACTIVE.
179 			 */
180 			req = ack_to_tid_req(e);
181 			if (req->state == TID_REQUEST_RESEND ||
182 			    req->state == TID_REQUEST_INIT_RESEND)
183 				goto bail;
184 			qp->s_ack_state = TID_OP(WRITE_RESP);
185 			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
186 			goto write_resp;
187 		} else if (e->opcode == TID_OP(READ_REQ)) {
188 			/*
189 			 * If a TID RDMA read response is being resent and
190 			 * we haven't seen the duplicate request yet,
191 			 * then stop sending the remaining responses the
192 			 * responder has seen until the requester re-sends it.
193 			 */
194 			len = e->rdma_sge.sge_length;
195 			if (len && !e->rdma_sge.mr) {
196 				if (qp->s_acked_ack_queue ==
197 				    qp->s_tail_ack_queue)
198 					qp->s_acked_ack_queue =
199 						qp->r_head_ack_queue;
200 				qp->s_tail_ack_queue = qp->r_head_ack_queue;
201 				goto bail;
202 			}
203 			/* Copy SGE state in case we need to resend */
204 			ps->s_txreq->mr = e->rdma_sge.mr;
205 			if (ps->s_txreq->mr)
206 				rvt_get_mr(ps->s_txreq->mr);
207 			qp->s_ack_rdma_sge.sge = e->rdma_sge;
208 			qp->s_ack_rdma_sge.num_sge = 1;
209 			qp->s_ack_state = TID_OP(READ_RESP);
210 			goto read_resp;
211 		} else {
212 			/* COMPARE_SWAP or FETCH_ADD */
213 			ps->s_txreq->ss = NULL;
214 			len = 0;
215 			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
216 			ohdr->u.at.aeth = rvt_compute_aeth(qp);
217 			ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
218 			hwords += sizeof(ohdr->u.at) / sizeof(u32);
219 			bth2 = mask_psn(e->psn);
220 			e->sent = 1;
221 		}
222 		trace_hfi1_tid_write_rsp_make_rc_ack(qp);
223 		bth0 = qp->s_ack_state << 24;
224 		break;
225 
226 	case OP(RDMA_READ_RESPONSE_FIRST):
227 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
228 		fallthrough;
229 	case OP(RDMA_READ_RESPONSE_MIDDLE):
230 		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
231 		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
232 		if (ps->s_txreq->mr)
233 			rvt_get_mr(ps->s_txreq->mr);
234 		len = qp->s_ack_rdma_sge.sge.sge_length;
235 		if (len > pmtu) {
236 			len = pmtu;
237 			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
238 		} else {
239 			ohdr->u.aeth = rvt_compute_aeth(qp);
240 			hwords++;
241 			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
242 			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
243 			e->sent = 1;
244 		}
245 		bth0 = qp->s_ack_state << 24;
246 		bth2 = mask_psn(qp->s_ack_rdma_psn++);
247 		break;
248 
249 	case TID_OP(WRITE_RESP):
250 write_resp:
251 		/*
252 		 * 1. Check if RVT_S_ACK_PENDING is set. If yes,
253 		 *    goto normal.
254 		 * 2. Attempt to allocate TID resources.
255 		 * 3. Remove RVT_S_RESP_PENDING flags from s_flags
256 		 * 4. If resources not available:
257 		 *    4.1 Set RVT_S_WAIT_TID_SPACE
258 		 *    4.2 Queue QP on RCD TID queue
259 		 *    4.3 Put QP on iowait list.
260 		 *    4.4 Build IB RNR NAK with appropriate timeout value
261 		 *    4.5 Return indication progress made.
262 		 * 5. If resources are available:
263 		 *    5.1 Program HW flow CSRs
264 		 *    5.2 Build TID RDMA WRITE RESP packet
265 		 *    5.3 If more resources needed, do 2.1 - 2.3.
266 		 *    5.4 Wake up next QP on RCD TID queue.
267 		 *    5.5 Return indication progress made.
268 		 */
269 
270 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
271 		req = ack_to_tid_req(e);
272 
273 		/*
274 		 * Send scheduled RNR NAK's. RNR NAK's need to be sent at
275 		 * segment boundaries, not at request boundaries. Don't change
276 		 * s_ack_state because we are still in the middle of a request
277 		 */
278 		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
279 		    qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
280 		    req->cur_seg == req->alloc_seg) {
281 			qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
282 			goto normal_no_state;
283 		}
284 
285 		bth2 = mask_psn(qp->s_ack_rdma_psn);
286 		hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
287 							bth2, &len,
288 							&ps->s_txreq->ss);
289 		if (!hdrlen)
290 			return 0;
291 
292 		hwords += hdrlen;
293 		bth0 = qp->s_ack_state << 24;
294 		qp->s_ack_rdma_psn++;
295 		trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
296 						     e->lpsn, req);
297 		if (req->cur_seg != req->total_segs)
298 			break;
299 
300 		e->sent = 1;
301 		/* Do not free e->rdma_sge until all data are received */
302 		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
303 		break;
304 
305 	case TID_OP(READ_RESP):
306 read_resp:
307 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
308 		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
309 		delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
310 						      &bth1, &bth2, &len,
311 						      &last_pkt);
312 		if (delta == 0)
313 			goto error_qp;
314 		hwords += delta;
315 		if (last_pkt) {
316 			e->sent = 1;
317 			/*
318 			 * Increment qp->s_tail_ack_queue through s_ack_state
319 			 * transition.
320 			 */
321 			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
322 		}
323 		break;
324 	case TID_OP(READ_REQ):
325 		goto bail;
326 
327 	default:
328 normal:
329 		/*
330 		 * Send a regular ACK.
331 		 * Set the s_ack_state so we wait until after sending
332 		 * the ACK before setting s_ack_state to ACKNOWLEDGE
333 		 * (see above).
334 		 */
335 		qp->s_ack_state = OP(SEND_ONLY);
336 normal_no_state:
337 		if (qp->s_nak_state)
338 			ohdr->u.aeth =
339 				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
340 					    (qp->s_nak_state <<
341 					     IB_AETH_CREDIT_SHIFT));
342 		else
343 			ohdr->u.aeth = rvt_compute_aeth(qp);
344 		hwords++;
345 		len = 0;
346 		bth0 = OP(ACKNOWLEDGE) << 24;
347 		bth2 = mask_psn(qp->s_ack_psn);
348 		qp->s_flags &= ~RVT_S_ACK_PENDING;
349 		ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
350 		ps->s_txreq->ss = NULL;
351 	}
352 	qp->s_rdma_ack_cnt++;
353 	ps->s_txreq->sde = qpriv->s_sde;
354 	ps->s_txreq->s_cur_size = len;
355 	ps->s_txreq->hdr_dwords = hwords;
356 	hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
357 	return 1;
358 error_qp:
359 	spin_unlock_irqrestore(&qp->s_lock, ps->flags);
360 	spin_lock_irqsave(&qp->r_lock, ps->flags);
361 	spin_lock(&qp->s_lock);
362 	rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
363 	spin_unlock(&qp->s_lock);
364 	spin_unlock_irqrestore(&qp->r_lock, ps->flags);
365 	spin_lock_irqsave(&qp->s_lock, ps->flags);
366 bail:
367 	qp->s_ack_state = OP(ACKNOWLEDGE);
368 	/*
369 	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
370 	 * RVT_S_RESP_PENDING
371 	 */
372 	smp_wmb();
373 	qp->s_flags &= ~(RVT_S_RESP_PENDING
374 				| RVT_S_ACK_PENDING
375 				| HFI1_S_AHG_VALID);
376 	return 0;
377 }
378 
379 /**
380  * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
381  * @qp: a pointer to the QP
382  * @ps: the current packet state
383  *
384  * Assumes s_lock is held.
385  *
386  * Return 1 if constructed; otherwise, return 0.
387  */
hfi1_make_rc_req(struct rvt_qp * qp,struct hfi1_pkt_state * ps)388 int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
389 {
390 	struct hfi1_qp_priv *priv = qp->priv;
391 	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
392 	struct ib_other_headers *ohdr;
393 	struct rvt_sge_state *ss = NULL;
394 	struct rvt_swqe *wqe;
395 	struct hfi1_swqe_priv *wpriv;
396 	struct tid_rdma_request *req = NULL;
397 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
398 	u32 hwords = 5;
399 	u32 len = 0;
400 	u32 bth0 = 0, bth2 = 0;
401 	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
402 	u32 pmtu = qp->pmtu;
403 	char newreq;
404 	int middle = 0;
405 	int delta;
406 	struct tid_rdma_flow *flow = NULL;
407 	struct tid_rdma_params *remote;
408 
409 	trace_hfi1_sender_make_rc_req(qp);
410 	lockdep_assert_held(&qp->s_lock);
411 	ps->s_txreq = get_txreq(ps->dev, qp);
412 	if (!ps->s_txreq)
413 		goto bail_no_tx;
414 
415 	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
416 		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
417 		hwords = 5;
418 		if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
419 			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
420 		else
421 			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
422 	} else {
423 		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
424 		hwords = 7;
425 		if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
426 		    (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
427 			ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
428 		else
429 			ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
430 	}
431 
432 	/* Sending responses has higher priority over sending requests. */
433 	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
434 	    make_rc_ack(dev, qp, ohdr, ps))
435 		return 1;
436 
437 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
438 		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
439 			goto bail;
440 		/* We are in the error state, flush the work request. */
441 		if (qp->s_last == READ_ONCE(qp->s_head))
442 			goto bail;
443 		/* If DMAs are in progress, we can't flush immediately. */
444 		if (iowait_sdma_pending(&priv->s_iowait)) {
445 			qp->s_flags |= RVT_S_WAIT_DMA;
446 			goto bail;
447 		}
448 		clear_ahg(qp);
449 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
450 		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
451 					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
452 		/* will get called again */
453 		goto done_free_tx;
454 	}
455 
456 	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
457 		goto bail;
458 
459 	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
460 		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
461 			qp->s_flags |= RVT_S_WAIT_PSN;
462 			goto bail;
463 		}
464 		qp->s_sending_psn = qp->s_psn;
465 		qp->s_sending_hpsn = qp->s_psn - 1;
466 	}
467 
468 	/* Send a request. */
469 	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
470 check_s_state:
471 	switch (qp->s_state) {
472 	default:
473 		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
474 			goto bail;
475 		/*
476 		 * Resend an old request or start a new one.
477 		 *
478 		 * We keep track of the current SWQE so that
479 		 * we don't reset the "furthest progress" state
480 		 * if we need to back up.
481 		 */
482 		newreq = 0;
483 		if (qp->s_cur == qp->s_tail) {
484 			/* Check if send work queue is empty. */
485 			if (qp->s_tail == READ_ONCE(qp->s_head)) {
486 				clear_ahg(qp);
487 				goto bail;
488 			}
489 			/*
490 			 * If a fence is requested, wait for previous
491 			 * RDMA read and atomic operations to finish.
492 			 * However, there is no need to guard against
493 			 * TID RDMA READ after TID RDMA READ.
494 			 */
495 			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
496 			    qp->s_num_rd_atomic &&
497 			    (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
498 			     priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
499 				qp->s_flags |= RVT_S_WAIT_FENCE;
500 				goto bail;
501 			}
502 			/*
503 			 * Local operations are processed immediately
504 			 * after all prior requests have completed
505 			 */
506 			if (wqe->wr.opcode == IB_WR_REG_MR ||
507 			    wqe->wr.opcode == IB_WR_LOCAL_INV) {
508 				int local_ops = 0;
509 				int err = 0;
510 
511 				if (qp->s_last != qp->s_cur)
512 					goto bail;
513 				if (++qp->s_cur == qp->s_size)
514 					qp->s_cur = 0;
515 				if (++qp->s_tail == qp->s_size)
516 					qp->s_tail = 0;
517 				if (!(wqe->wr.send_flags &
518 				      RVT_SEND_COMPLETION_ONLY)) {
519 					err = rvt_invalidate_rkey(
520 						qp,
521 						wqe->wr.ex.invalidate_rkey);
522 					local_ops = 1;
523 				}
524 				rvt_send_complete(qp, wqe,
525 						  err ? IB_WC_LOC_PROT_ERR
526 						      : IB_WC_SUCCESS);
527 				if (local_ops)
528 					atomic_dec(&qp->local_ops_pending);
529 				goto done_free_tx;
530 			}
531 
532 			newreq = 1;
533 			qp->s_psn = wqe->psn;
534 		}
535 		/*
536 		 * Note that we have to be careful not to modify the
537 		 * original work request since we may need to resend
538 		 * it.
539 		 */
540 		len = wqe->length;
541 		ss = &qp->s_sge;
542 		bth2 = mask_psn(qp->s_psn);
543 
544 		/*
545 		 * Interlock between various IB requests and TID RDMA
546 		 * if necessary.
547 		 */
548 		if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
549 		    hfi1_tid_rdma_wqe_interlock(qp, wqe))
550 			goto bail;
551 
552 		switch (wqe->wr.opcode) {
553 		case IB_WR_SEND:
554 		case IB_WR_SEND_WITH_IMM:
555 		case IB_WR_SEND_WITH_INV:
556 			/* If no credit, return. */
557 			if (!rvt_rc_credit_avail(qp, wqe))
558 				goto bail;
559 			if (len > pmtu) {
560 				qp->s_state = OP(SEND_FIRST);
561 				len = pmtu;
562 				break;
563 			}
564 			if (wqe->wr.opcode == IB_WR_SEND) {
565 				qp->s_state = OP(SEND_ONLY);
566 			} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
567 				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
568 				/* Immediate data comes after the BTH */
569 				ohdr->u.imm_data = wqe->wr.ex.imm_data;
570 				hwords += 1;
571 			} else {
572 				qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
573 				/* Invalidate rkey comes after the BTH */
574 				ohdr->u.ieth = cpu_to_be32(
575 						wqe->wr.ex.invalidate_rkey);
576 				hwords += 1;
577 			}
578 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
579 				bth0 |= IB_BTH_SOLICITED;
580 			bth2 |= IB_BTH_REQ_ACK;
581 			if (++qp->s_cur == qp->s_size)
582 				qp->s_cur = 0;
583 			break;
584 
585 		case IB_WR_RDMA_WRITE:
586 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
587 				qp->s_lsn++;
588 			goto no_flow_control;
589 		case IB_WR_RDMA_WRITE_WITH_IMM:
590 			/* If no credit, return. */
591 			if (!rvt_rc_credit_avail(qp, wqe))
592 				goto bail;
593 no_flow_control:
594 			put_ib_reth_vaddr(
595 				wqe->rdma_wr.remote_addr,
596 				&ohdr->u.rc.reth);
597 			ohdr->u.rc.reth.rkey =
598 				cpu_to_be32(wqe->rdma_wr.rkey);
599 			ohdr->u.rc.reth.length = cpu_to_be32(len);
600 			hwords += sizeof(struct ib_reth) / sizeof(u32);
601 			if (len > pmtu) {
602 				qp->s_state = OP(RDMA_WRITE_FIRST);
603 				len = pmtu;
604 				break;
605 			}
606 			if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
607 				qp->s_state = OP(RDMA_WRITE_ONLY);
608 			} else {
609 				qp->s_state =
610 					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
611 				/* Immediate data comes after RETH */
612 				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
613 				hwords += 1;
614 				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
615 					bth0 |= IB_BTH_SOLICITED;
616 			}
617 			bth2 |= IB_BTH_REQ_ACK;
618 			if (++qp->s_cur == qp->s_size)
619 				qp->s_cur = 0;
620 			break;
621 
622 		case IB_WR_TID_RDMA_WRITE:
623 			if (newreq) {
624 				/*
625 				 * Limit the number of TID RDMA WRITE requests.
626 				 */
627 				if (atomic_read(&priv->n_tid_requests) >=
628 				    HFI1_TID_RDMA_WRITE_CNT)
629 					goto bail;
630 
631 				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
632 					qp->s_lsn++;
633 			}
634 
635 			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
636 								&bth1, &bth2,
637 								&len);
638 			ss = NULL;
639 			if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
640 				priv->s_tid_cur = qp->s_cur;
641 				if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
642 					priv->s_tid_tail = qp->s_cur;
643 					priv->s_state = TID_OP(WRITE_RESP);
644 				}
645 			} else if (priv->s_tid_cur == priv->s_tid_head) {
646 				struct rvt_swqe *__w;
647 				struct tid_rdma_request *__r;
648 
649 				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
650 				__r = wqe_to_tid_req(__w);
651 
652 				/*
653 				 * The s_tid_cur pointer is advanced to s_cur if
654 				 * any of the following conditions about the WQE
655 				 * to which s_tid_cur currently points are
656 				 * satisfied:
657 				 *   1. The request is not a TID RDMA WRITE
658 				 *      request,
659 				 *   2. The request is in the INACTIVE or
660 				 *      COMPLETE states (TID RDMA READ requests
661 				 *      stay at INACTIVE and TID RDMA WRITE
662 				 *      transition to COMPLETE when done),
663 				 *   3. The request is in the ACTIVE or SYNC
664 				 *      state and the number of completed
665 				 *      segments is equal to the total segment
666 				 *      count.
667 				 *      (If ACTIVE, the request is waiting for
668 				 *       ACKs. If SYNC, the request has not
669 				 *       received any responses because it's
670 				 *       waiting on a sync point.)
671 				 */
672 				if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
673 				    __r->state == TID_REQUEST_INACTIVE ||
674 				    __r->state == TID_REQUEST_COMPLETE ||
675 				    ((__r->state == TID_REQUEST_ACTIVE ||
676 				      __r->state == TID_REQUEST_SYNC) &&
677 				     __r->comp_seg == __r->total_segs)) {
678 					if (priv->s_tid_tail ==
679 					    priv->s_tid_cur &&
680 					    priv->s_state ==
681 					    TID_OP(WRITE_DATA_LAST)) {
682 						priv->s_tid_tail = qp->s_cur;
683 						priv->s_state =
684 							TID_OP(WRITE_RESP);
685 					}
686 					priv->s_tid_cur = qp->s_cur;
687 				}
688 				/*
689 				 * A corner case: when the last TID RDMA WRITE
690 				 * request was completed, s_tid_head,
691 				 * s_tid_cur, and s_tid_tail all point to the
692 				 * same location. Other requests are posted and
693 				 * s_cur wraps around to the same location,
694 				 * where a new TID RDMA WRITE is posted. In
695 				 * this case, none of the indices need to be
696 				 * updated. However, the priv->s_state should.
697 				 */
698 				if (priv->s_tid_tail == qp->s_cur &&
699 				    priv->s_state == TID_OP(WRITE_DATA_LAST))
700 					priv->s_state = TID_OP(WRITE_RESP);
701 			}
702 			req = wqe_to_tid_req(wqe);
703 			if (newreq) {
704 				priv->s_tid_head = qp->s_cur;
705 				priv->pending_tid_w_resp += req->total_segs;
706 				atomic_inc(&priv->n_tid_requests);
707 				atomic_dec(&priv->n_requests);
708 			} else {
709 				req->state = TID_REQUEST_RESEND;
710 				req->comp_seg = delta_psn(bth2, wqe->psn);
711 				/*
712 				 * Pull back any segments since we are going
713 				 * to re-receive them.
714 				 */
715 				req->setup_head = req->clear_tail;
716 				priv->pending_tid_w_resp +=
717 					delta_psn(wqe->lpsn, bth2) + 1;
718 			}
719 
720 			trace_hfi1_tid_write_sender_make_req(qp, newreq);
721 			trace_hfi1_tid_req_make_req_write(qp, newreq,
722 							  wqe->wr.opcode,
723 							  wqe->psn, wqe->lpsn,
724 							  req);
725 			if (++qp->s_cur == qp->s_size)
726 				qp->s_cur = 0;
727 			break;
728 
729 		case IB_WR_RDMA_READ:
730 			/*
731 			 * Don't allow more operations to be started
732 			 * than the QP limits allow.
733 			 */
734 			if (qp->s_num_rd_atomic >=
735 			    qp->s_max_rd_atomic) {
736 				qp->s_flags |= RVT_S_WAIT_RDMAR;
737 				goto bail;
738 			}
739 			qp->s_num_rd_atomic++;
740 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
741 				qp->s_lsn++;
742 			put_ib_reth_vaddr(
743 				wqe->rdma_wr.remote_addr,
744 				&ohdr->u.rc.reth);
745 			ohdr->u.rc.reth.rkey =
746 				cpu_to_be32(wqe->rdma_wr.rkey);
747 			ohdr->u.rc.reth.length = cpu_to_be32(len);
748 			qp->s_state = OP(RDMA_READ_REQUEST);
749 			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
750 			ss = NULL;
751 			len = 0;
752 			bth2 |= IB_BTH_REQ_ACK;
753 			if (++qp->s_cur == qp->s_size)
754 				qp->s_cur = 0;
755 			break;
756 
757 		case IB_WR_TID_RDMA_READ:
758 			trace_hfi1_tid_read_sender_make_req(qp, newreq);
759 			wpriv = wqe->priv;
760 			req = wqe_to_tid_req(wqe);
761 			trace_hfi1_tid_req_make_req_read(qp, newreq,
762 							 wqe->wr.opcode,
763 							 wqe->psn, wqe->lpsn,
764 							 req);
765 			delta = cmp_psn(qp->s_psn, wqe->psn);
766 
767 			/*
768 			 * Don't allow more operations to be started
769 			 * than the QP limits allow. We could get here under
770 			 * three conditions; (1) It's a new request; (2) We are
771 			 * sending the second or later segment of a request,
772 			 * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
773 			 * when the last segment of a previous request is
774 			 * received just before this; (3) We are re-sending a
775 			 * request.
776 			 */
777 			if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
778 				qp->s_flags |= RVT_S_WAIT_RDMAR;
779 				goto bail;
780 			}
781 			if (newreq) {
782 				struct tid_rdma_flow *flow =
783 					&req->flows[req->setup_head];
784 
785 				/*
786 				 * Set up s_sge as it is needed for TID
787 				 * allocation. However, if the pages have been
788 				 * walked and mapped, skip it. An earlier try
789 				 * has failed to allocate the TID entries.
790 				 */
791 				if (!flow->npagesets) {
792 					qp->s_sge.sge = wqe->sg_list[0];
793 					qp->s_sge.sg_list = wqe->sg_list + 1;
794 					qp->s_sge.num_sge = wqe->wr.num_sge;
795 					qp->s_sge.total_len = wqe->length;
796 					qp->s_len = wqe->length;
797 					req->isge = 0;
798 					req->clear_tail = req->setup_head;
799 					req->flow_idx = req->setup_head;
800 					req->state = TID_REQUEST_ACTIVE;
801 				}
802 			} else if (delta == 0) {
803 				/* Re-send a request */
804 				req->cur_seg = 0;
805 				req->comp_seg = 0;
806 				req->ack_pending = 0;
807 				req->flow_idx = req->clear_tail;
808 				req->state = TID_REQUEST_RESEND;
809 			}
810 			req->s_next_psn = qp->s_psn;
811 			/* Read one segment at a time */
812 			len = min_t(u32, req->seg_len,
813 				    wqe->length - req->seg_len * req->cur_seg);
814 			delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
815 							     &bth1, &bth2,
816 							     &len);
817 			if (delta <= 0) {
818 				/* Wait for TID space */
819 				goto bail;
820 			}
821 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
822 				qp->s_lsn++;
823 			hwords += delta;
824 			ss = &wpriv->ss;
825 			/* Check if this is the last segment */
826 			if (req->cur_seg >= req->total_segs &&
827 			    ++qp->s_cur == qp->s_size)
828 				qp->s_cur = 0;
829 			break;
830 
831 		case IB_WR_ATOMIC_CMP_AND_SWP:
832 		case IB_WR_ATOMIC_FETCH_AND_ADD:
833 			/*
834 			 * Don't allow more operations to be started
835 			 * than the QP limits allow.
836 			 */
837 			if (qp->s_num_rd_atomic >=
838 			    qp->s_max_rd_atomic) {
839 				qp->s_flags |= RVT_S_WAIT_RDMAR;
840 				goto bail;
841 			}
842 			qp->s_num_rd_atomic++;
843 			fallthrough;
844 		case IB_WR_OPFN:
845 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
846 				qp->s_lsn++;
847 			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
848 			    wqe->wr.opcode == IB_WR_OPFN) {
849 				qp->s_state = OP(COMPARE_SWAP);
850 				put_ib_ateth_swap(wqe->atomic_wr.swap,
851 						  &ohdr->u.atomic_eth);
852 				put_ib_ateth_compare(wqe->atomic_wr.compare_add,
853 						     &ohdr->u.atomic_eth);
854 			} else {
855 				qp->s_state = OP(FETCH_ADD);
856 				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
857 						  &ohdr->u.atomic_eth);
858 				put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
859 			}
860 			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
861 					   &ohdr->u.atomic_eth);
862 			ohdr->u.atomic_eth.rkey = cpu_to_be32(
863 				wqe->atomic_wr.rkey);
864 			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
865 			ss = NULL;
866 			len = 0;
867 			bth2 |= IB_BTH_REQ_ACK;
868 			if (++qp->s_cur == qp->s_size)
869 				qp->s_cur = 0;
870 			break;
871 
872 		default:
873 			goto bail;
874 		}
875 		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
876 			qp->s_sge.sge = wqe->sg_list[0];
877 			qp->s_sge.sg_list = wqe->sg_list + 1;
878 			qp->s_sge.num_sge = wqe->wr.num_sge;
879 			qp->s_sge.total_len = wqe->length;
880 			qp->s_len = wqe->length;
881 		}
882 		if (newreq) {
883 			qp->s_tail++;
884 			if (qp->s_tail >= qp->s_size)
885 				qp->s_tail = 0;
886 		}
887 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
888 		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
889 			qp->s_psn = wqe->lpsn + 1;
890 		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
891 			qp->s_psn = req->s_next_psn;
892 		else
893 			qp->s_psn++;
894 		break;
895 
896 	case OP(RDMA_READ_RESPONSE_FIRST):
897 		/*
898 		 * qp->s_state is normally set to the opcode of the
899 		 * last packet constructed for new requests and therefore
900 		 * is never set to RDMA read response.
901 		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
902 		 * thread to indicate a SEND needs to be restarted from an
903 		 * earlier PSN without interfering with the sending thread.
904 		 * See restart_rc().
905 		 */
906 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
907 		fallthrough;
908 	case OP(SEND_FIRST):
909 		qp->s_state = OP(SEND_MIDDLE);
910 		fallthrough;
911 	case OP(SEND_MIDDLE):
912 		bth2 = mask_psn(qp->s_psn++);
913 		ss = &qp->s_sge;
914 		len = qp->s_len;
915 		if (len > pmtu) {
916 			len = pmtu;
917 			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
918 			break;
919 		}
920 		if (wqe->wr.opcode == IB_WR_SEND) {
921 			qp->s_state = OP(SEND_LAST);
922 		} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
923 			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
924 			/* Immediate data comes after the BTH */
925 			ohdr->u.imm_data = wqe->wr.ex.imm_data;
926 			hwords += 1;
927 		} else {
928 			qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
929 			/* invalidate data comes after the BTH */
930 			ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
931 			hwords += 1;
932 		}
933 		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
934 			bth0 |= IB_BTH_SOLICITED;
935 		bth2 |= IB_BTH_REQ_ACK;
936 		qp->s_cur++;
937 		if (qp->s_cur >= qp->s_size)
938 			qp->s_cur = 0;
939 		break;
940 
941 	case OP(RDMA_READ_RESPONSE_LAST):
942 		/*
943 		 * qp->s_state is normally set to the opcode of the
944 		 * last packet constructed for new requests and therefore
945 		 * is never set to RDMA read response.
946 		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
947 		 * thread to indicate a RDMA write needs to be restarted from
948 		 * an earlier PSN without interfering with the sending thread.
949 		 * See restart_rc().
950 		 */
951 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
952 		fallthrough;
953 	case OP(RDMA_WRITE_FIRST):
954 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
955 		fallthrough;
956 	case OP(RDMA_WRITE_MIDDLE):
957 		bth2 = mask_psn(qp->s_psn++);
958 		ss = &qp->s_sge;
959 		len = qp->s_len;
960 		if (len > pmtu) {
961 			len = pmtu;
962 			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
963 			break;
964 		}
965 		if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
966 			qp->s_state = OP(RDMA_WRITE_LAST);
967 		} else {
968 			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
969 			/* Immediate data comes after the BTH */
970 			ohdr->u.imm_data = wqe->wr.ex.imm_data;
971 			hwords += 1;
972 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
973 				bth0 |= IB_BTH_SOLICITED;
974 		}
975 		bth2 |= IB_BTH_REQ_ACK;
976 		qp->s_cur++;
977 		if (qp->s_cur >= qp->s_size)
978 			qp->s_cur = 0;
979 		break;
980 
981 	case OP(RDMA_READ_RESPONSE_MIDDLE):
982 		/*
983 		 * qp->s_state is normally set to the opcode of the
984 		 * last packet constructed for new requests and therefore
985 		 * is never set to RDMA read response.
986 		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
987 		 * thread to indicate a RDMA read needs to be restarted from
988 		 * an earlier PSN without interfering with the sending thread.
989 		 * See restart_rc().
990 		 */
991 		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
992 		put_ib_reth_vaddr(
993 			wqe->rdma_wr.remote_addr + len,
994 			&ohdr->u.rc.reth);
995 		ohdr->u.rc.reth.rkey =
996 			cpu_to_be32(wqe->rdma_wr.rkey);
997 		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
998 		qp->s_state = OP(RDMA_READ_REQUEST);
999 		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
1000 		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
1001 		qp->s_psn = wqe->lpsn + 1;
1002 		ss = NULL;
1003 		len = 0;
1004 		qp->s_cur++;
1005 		if (qp->s_cur == qp->s_size)
1006 			qp->s_cur = 0;
1007 		break;
1008 
1009 	case TID_OP(WRITE_RESP):
1010 		/*
1011 		 * This value for s_state is used for restarting a TID RDMA
1012 		 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
1013 		 * for more).
1014 		 */
1015 		req = wqe_to_tid_req(wqe);
1016 		req->state = TID_REQUEST_RESEND;
1017 		rcu_read_lock();
1018 		remote = rcu_dereference(priv->tid_rdma.remote);
1019 		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
1020 		len = wqe->length - (req->comp_seg * remote->max_len);
1021 		rcu_read_unlock();
1022 
1023 		bth2 = mask_psn(qp->s_psn);
1024 		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
1025 							&bth2, &len);
1026 		qp->s_psn = wqe->lpsn + 1;
1027 		ss = NULL;
1028 		qp->s_state = TID_OP(WRITE_REQ);
1029 		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
1030 		priv->s_tid_cur = qp->s_cur;
1031 		if (++qp->s_cur == qp->s_size)
1032 			qp->s_cur = 0;
1033 		trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
1034 						  wqe->psn, wqe->lpsn, req);
1035 		break;
1036 
1037 	case TID_OP(READ_RESP):
1038 		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
1039 			goto bail;
1040 		/* This is used to restart a TID read request */
1041 		req = wqe_to_tid_req(wqe);
1042 		wpriv = wqe->priv;
1043 		/*
1044 		 * Back down. The field qp->s_psn has been set to the psn with
1045 		 * which the request should be restart. It's OK to use division
1046 		 * as this is on the retry path.
1047 		 */
1048 		req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
1049 
1050 		/*
1051 		 * The following function need to be redefined to return the
1052 		 * status to make sure that we find the flow. At the same
1053 		 * time, we can use the req->state change to check if the
1054 		 * call succeeds or not.
1055 		 */
1056 		req->state = TID_REQUEST_RESEND;
1057 		hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
1058 		if (req->state != TID_REQUEST_ACTIVE) {
1059 			/*
1060 			 * Failed to find the flow. Release all allocated tid
1061 			 * resources.
1062 			 */
1063 			hfi1_kern_exp_rcv_clear_all(req);
1064 			hfi1_kern_clear_hw_flow(priv->rcd, qp);
1065 
1066 			hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
1067 			goto bail;
1068 		}
1069 		req->state = TID_REQUEST_RESEND;
1070 		len = min_t(u32, req->seg_len,
1071 			    wqe->length - req->seg_len * req->cur_seg);
1072 		flow = &req->flows[req->flow_idx];
1073 		len -= flow->sent;
1074 		req->s_next_psn = flow->flow_state.ib_lpsn + 1;
1075 		delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
1076 							&bth2, &len);
1077 		if (delta <= 0) {
1078 			/* Wait for TID space */
1079 			goto bail;
1080 		}
1081 		hwords += delta;
1082 		ss = &wpriv->ss;
1083 		/* Check if this is the last segment */
1084 		if (req->cur_seg >= req->total_segs &&
1085 		    ++qp->s_cur == qp->s_size)
1086 			qp->s_cur = 0;
1087 		qp->s_psn = req->s_next_psn;
1088 		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
1089 						 wqe->psn, wqe->lpsn, req);
1090 		break;
1091 	case TID_OP(READ_REQ):
1092 		req = wqe_to_tid_req(wqe);
1093 		delta = cmp_psn(qp->s_psn, wqe->psn);
1094 		/*
1095 		 * If the current WR is not TID RDMA READ, or this is the start
1096 		 * of a new request, we need to change the qp->s_state so that
1097 		 * the request can be set up properly.
1098 		 */
1099 		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
1100 		    qp->s_cur == qp->s_tail) {
1101 			qp->s_state = OP(RDMA_READ_REQUEST);
1102 			if (delta == 0 || qp->s_cur == qp->s_tail)
1103 				goto check_s_state;
1104 			else
1105 				goto bail;
1106 		}
1107 
1108 		/* Rate limiting */
1109 		if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
1110 			qp->s_flags |= RVT_S_WAIT_RDMAR;
1111 			goto bail;
1112 		}
1113 
1114 		wpriv = wqe->priv;
1115 		/* Read one segment at a time */
1116 		len = min_t(u32, req->seg_len,
1117 			    wqe->length - req->seg_len * req->cur_seg);
1118 		delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
1119 						     &bth2, &len);
1120 		if (delta <= 0) {
1121 			/* Wait for TID space */
1122 			goto bail;
1123 		}
1124 		hwords += delta;
1125 		ss = &wpriv->ss;
1126 		/* Check if this is the last segment */
1127 		if (req->cur_seg >= req->total_segs &&
1128 		    ++qp->s_cur == qp->s_size)
1129 			qp->s_cur = 0;
1130 		qp->s_psn = req->s_next_psn;
1131 		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
1132 						 wqe->psn, wqe->lpsn, req);
1133 		break;
1134 	}
1135 	qp->s_sending_hpsn = bth2;
1136 	delta = delta_psn(bth2, wqe->psn);
1137 	if (delta && delta % HFI1_PSN_CREDIT == 0 &&
1138 	    wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
1139 		bth2 |= IB_BTH_REQ_ACK;
1140 	if (qp->s_flags & RVT_S_SEND_ONE) {
1141 		qp->s_flags &= ~RVT_S_SEND_ONE;
1142 		qp->s_flags |= RVT_S_WAIT_ACK;
1143 		bth2 |= IB_BTH_REQ_ACK;
1144 	}
1145 	qp->s_len -= len;
1146 	ps->s_txreq->hdr_dwords = hwords;
1147 	ps->s_txreq->sde = priv->s_sde;
1148 	ps->s_txreq->ss = ss;
1149 	ps->s_txreq->s_cur_size = len;
1150 	hfi1_make_ruc_header(
1151 		qp,
1152 		ohdr,
1153 		bth0 | (qp->s_state << 24),
1154 		bth1,
1155 		bth2,
1156 		middle,
1157 		ps);
1158 	return 1;
1159 
1160 done_free_tx:
1161 	hfi1_put_txreq(ps->s_txreq);
1162 	ps->s_txreq = NULL;
1163 	return 1;
1164 
1165 bail:
1166 	hfi1_put_txreq(ps->s_txreq);
1167 
1168 bail_no_tx:
1169 	ps->s_txreq = NULL;
1170 	qp->s_flags &= ~RVT_S_BUSY;
1171 	/*
1172 	 * If we didn't get a txreq, the QP will be woken up later to try
1173 	 * again. Set the flags to indicate which work item to wake
1174 	 * up.
1175 	 */
1176 	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
1177 	return 0;
1178 }
1179 
/*
 * Fill in the AETH and the three BTH words of an RC ACK/NAK header.
 *
 * When qp->r_nak_state is set, the AETH is built from the masked r_msn
 * combined with the NAK state shifted into the credit field; otherwise
 * the AETH is whatever rvt_compute_aeth() returns.  BTH[1] receives
 * @bth1 OR-ed with the remote QPN and BTH[2] the masked r_ack_psn.
 */
static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
				      struct ib_other_headers *ohdr,
				      u32 bth0, u32 bth1)
{
	__be32 aeth;

	if (!qp->r_nak_state) {
		aeth = rvt_compute_aeth(qp);
	} else {
		u32 nak = qp->r_nak_state << IB_AETH_CREDIT_SHIFT;

		aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | nak);
	}
	ohdr->u.aeth = aeth;

	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
}
1195 
/*
 * Hand an ACK/NAK over to the send engine instead of sending it inline.
 *
 * Takes qp->s_lock itself.  If the QP state no longer allows receive
 * processing nothing is queued.  Otherwise the receive-side NAK state
 * and ACK PSN are copied to their send-side counterparts, the ACK and
 * response pending flags are raised (plus RVT_S_ECN when @is_fecn is
 * set) and the send work is scheduled.
 */
static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn)
{
	struct rvt_qp *qp = packet->qp;
	unsigned long irq_flags;

	spin_lock_irqsave(&qp->s_lock, irq_flags);
	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
		struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);

		/* Account for the queued ACK in the per-CPU counter. */
		this_cpu_inc(*ibp->rvp.rc_qacks);

		qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
		qp->s_nak_state = qp->r_nak_state;
		qp->s_ack_psn = qp->r_ack_psn;
		if (is_fecn)
			qp->s_flags |= RVT_S_ECN;

		/* Schedule the send tasklet. */
		hfi1_schedule_send(qp);
	}
	spin_unlock_irqrestore(&qp->s_lock, irq_flags);
}
1218 
/*
 * Build the 9B header of an inline RC ACK into @opa_hdr.
 *
 * On return *@hwords holds the header length in 32-bit words
 * (LRH + BTH + AETH, plus whatever hfi1_make_grh() reports when the
 * remote AH carries IB_AH_GRH).  The PBC_DC_INFO bit is OR-ed into
 * *@pbc_flags when bit 4 of @sc5 is set.  @nwords is not written by
 * this variant.
 */
static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
				       struct hfi1_opa_header *opa_hdr,
				       u8 sc5, bool is_fecn,
				       u64 *pbc_flags, u32 *hwords,
				       u32 *nwords)
{
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct ib_header *hdr = &opa_hdr->ibh;
	struct ib_other_headers *ohdr = &hdr->u.oth;
	u16 lrh0 = HFI1_LRH_BTH;
	u32 bth0, bth1;
	u16 pkey;

	opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;

	/* LRH (8 bytes) + BTH (12 bytes) + AETH (4 bytes) = 6 dwords */
	*hwords = 6;
	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
		/* A GRH sits between the LRH and the BTH. */
		lrh0 = HFI1_LRH_GRH;
		ohdr = &hdr->u.l.oth;
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 2, SIZE_OF_CRC);
	}

	/* SC[4] travels in the PBC_DC_INFO bit of the PBC flags. */
	if (sc5 & 0x10)
		*pbc_flags |= 1 << PBC_DC_INFO_SHIFT;

	/* pkey_index is read without a lock (the read is atomic) */
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);

	lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT;
	lrh0 |= (rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
		IB_SL_SHIFT;

	hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
			 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
			 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));

	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;

	/*
	 * Inline ACKs bypass the Verbs send engine, so the STL Verbs
	 * Extended bit has to be set here, next to the BECN bit.
	 */
	bth1 = ((!!is_fecn) << IB_BECN_SHIFT) |
	       (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);

	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
1271 
/*
 * Build the 16B header of an inline RC ACK into @opa_hdr.
 *
 * On return *@hwords holds the header length in 32-bit words
 * (16B LRH + BTH + AETH, plus the GRH length when one is added) and
 * *@nwords the trailer length derived from the CRC, the padding reported
 * by hfi1_get_16b_padding() and SIZE_OF_LT.  The bypass flags are OR-ed
 * into *@pbc_flags.  @is_fecn is passed to hfi1_make_16b_hdr() as the
 * BECN argument.
 */
static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet,
					struct hfi1_opa_header *opa_hdr,
					u8 sc5, bool is_fecn,
					u64 *pbc_flags, u32 *hwords,
					u32 *nwords)
{
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct hfi1_16b_header *hdr = &opa_hdr->opah;
	struct ib_other_headers *ohdr = &hdr->u.oth;
	u8 l4 = OPA_16B_L4_IB_LOCAL;
	u32 slid, dlid;
	u32 bth0, bth1;
	u16 flits, pkey;
	u8 pad_bytes;

	opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;

	/* 16B LRH (16 bytes) + BTH (12 bytes) + AETH (4 bytes) = 8 dwords */
	*hwords = 8;
	pad_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
	*nwords = SIZE_OF_CRC + ((pad_bytes + SIZE_OF_LT) >> 2);

	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
	    hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
		l4 = OPA_16B_L4_IB_GLOBAL;
		ohdr = &hdr->u.l.oth;
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 4, *nwords);
	}
	*pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;

	/* pkey_index is read without a lock (the read is atomic) */
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);

	/* The 16B length field counts flits, i.e. pairs of dwords. */
	flits = (*hwords + *nwords) >> 1;

	/* Only the low LMC bits of the path bits go into the source LID. */
	slid = ppd->lid |
	       (rdma_ah_get_path_bits(&qp->remote_ah_attr) &
		((1 << ppd->lmc) - 1));
	dlid = opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 16B);
	hfi1_make_16b_hdr(hdr, slid, dlid, flits, pkey, is_fecn, 0, l4, sc5);

	bth0 = pkey | (OP(ACKNOWLEDGE) << 24) | (pad_bytes << 20);
	bth1 = qp->s_mig_state == IB_MIG_MIGRATED ? OPA_BTH_MIG_REQ : 0;
	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
1324 
1325 typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet,
1326 				 struct hfi1_opa_header *opa_hdr,
1327 				 u8 sc5, bool is_fecn,
1328 				 u64 *pbc_flags, u32 *hwords,
1329 				 u32 *nwords);
1330 
1331 /* We support only two types - 9B and 16B for now */
1332 static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
1333 	[HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
1334 	[HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
1335 };
1336 
1337 /*
1338  * hfi1_send_rc_ack - Construct an ACK packet and send it
1339  *
1340  * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
1341  * Note that RDMA reads and atomics are handled in the
1342  * send side QP state and send engine.
1343  */
hfi1_send_rc_ack(struct hfi1_packet * packet,bool is_fecn)1344 void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
1345 {
1346 	struct hfi1_ctxtdata *rcd = packet->rcd;
1347 	struct rvt_qp *qp = packet->qp;
1348 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
1349 	struct hfi1_qp_priv *priv = qp->priv;
1350 	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1351 	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
1352 	u64 pbc, pbc_flags = 0;
1353 	u32 hwords = 0;
1354 	u32 nwords = 0;
1355 	u32 plen;
1356 	struct pio_buf *pbuf;
1357 	struct hfi1_opa_header opa_hdr;
1358 
1359 	/* clear the defer count */
1360 	qp->r_adefered = 0;
1361 
1362 	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
1363 	if (qp->s_flags & RVT_S_RESP_PENDING) {
1364 		hfi1_queue_rc_ack(packet, is_fecn);
1365 		return;
1366 	}
1367 
1368 	/* Ensure s_rdma_ack_cnt changes are committed */
1369 	if (qp->s_rdma_ack_cnt) {
1370 		hfi1_queue_rc_ack(packet, is_fecn);
1371 		return;
1372 	}
1373 
1374 	/* Don't try to send ACKs if the link isn't ACTIVE */
1375 	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
1376 		return;
1377 
1378 	/* Make the appropriate header */
1379 	hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn,
1380 					     &pbc_flags, &hwords, &nwords);
1381 
1382 	plen = 2 /* PBC */ + hwords + nwords;
1383 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
1384 			 sc_to_vlt(ppd->dd, sc5), plen);
1385 	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
1386 	if (IS_ERR_OR_NULL(pbuf)) {
1387 		/*
1388 		 * We have no room to send at the moment.  Pass
1389 		 * responsibility for sending the ACK to the send engine
1390 		 * so that when enough buffer space becomes available,
1391 		 * the ACK is sent ahead of other outgoing packets.
1392 		 */
1393 		hfi1_queue_rc_ack(packet, is_fecn);
1394 		return;
1395 	}
1396 	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
1397 			       &opa_hdr, ib_is_sc5(sc5));
1398 
1399 	/* write the pbc and data */
1400 	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
1401 				 (priv->hdr_type == HFI1_PKT_TYPE_9B ?
1402 				 (void *)&opa_hdr.ibh :
1403 				 (void *)&opa_hdr.opah), hwords);
1404 	return;
1405 }
1406 
1407 /**
1408  * update_num_rd_atomic - update the qp->s_num_rd_atomic
1409  * @qp: the QP
1410  * @psn: the packet sequence number to restart at
1411  * @wqe: the wqe
1412  *
1413  * This is called from reset_psn() to update qp->s_num_rd_atomic
1414  * for the current wqe.
1415  * Called at interrupt level with the QP s_lock held.
1416  */
update_num_rd_atomic(struct rvt_qp * qp,u32 psn,struct rvt_swqe * wqe)1417 static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
1418 				 struct rvt_swqe *wqe)
1419 {
1420 	u32 opcode = wqe->wr.opcode;
1421 
1422 	if (opcode == IB_WR_RDMA_READ ||
1423 	    opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1424 	    opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1425 		qp->s_num_rd_atomic++;
1426 	} else if (opcode == IB_WR_TID_RDMA_READ) {
1427 		struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1428 		struct hfi1_qp_priv *priv = qp->priv;
1429 
1430 		if (cmp_psn(psn, wqe->lpsn) <= 0) {
1431 			u32 cur_seg;
1432 
1433 			cur_seg = (psn - wqe->psn) / priv->pkts_ps;
1434 			req->ack_pending = cur_seg - req->comp_seg;
1435 			priv->pending_tid_r_segs += req->ack_pending;
1436 			qp->s_num_rd_atomic += req->ack_pending;
1437 			trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
1438 								wqe->wr.opcode,
1439 								wqe->psn,
1440 								wqe->lpsn,
1441 								req);
1442 		} else {
1443 			priv->pending_tid_r_segs += req->total_segs;
1444 			qp->s_num_rd_atomic += req->total_segs;
1445 		}
1446 	}
1447 }
1448 
1449 /**
1450  * reset_psn - reset the QP state to send starting from PSN
1451  * @qp: the QP
1452  * @psn: the packet sequence number to restart at
1453  *
1454  * This is called from hfi1_rc_rcv() to process an incoming RC ACK
1455  * for the given QP.
1456  * Called at interrupt level with the QP s_lock held.
1457  */
reset_psn(struct rvt_qp * qp,u32 psn)1458 static void reset_psn(struct rvt_qp *qp, u32 psn)
1459 {
1460 	u32 n = qp->s_acked;
1461 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
1462 	u32 opcode;
1463 	struct hfi1_qp_priv *priv = qp->priv;
1464 
1465 	lockdep_assert_held(&qp->s_lock);
1466 	qp->s_cur = n;
1467 	priv->pending_tid_r_segs = 0;
1468 	priv->pending_tid_w_resp = 0;
1469 	qp->s_num_rd_atomic = 0;
1470 
1471 	/*
1472 	 * If we are starting the request from the beginning,
1473 	 * let the normal send code handle initialization.
1474 	 */
1475 	if (cmp_psn(psn, wqe->psn) <= 0) {
1476 		qp->s_state = OP(SEND_LAST);
1477 		goto done;
1478 	}
1479 	update_num_rd_atomic(qp, psn, wqe);
1480 
1481 	/* Find the work request opcode corresponding to the given PSN. */
1482 	for (;;) {
1483 		int diff;
1484 
1485 		if (++n == qp->s_size)
1486 			n = 0;
1487 		if (n == qp->s_tail)
1488 			break;
1489 		wqe = rvt_get_swqe_ptr(qp, n);
1490 		diff = cmp_psn(psn, wqe->psn);
1491 		if (diff < 0) {
1492 			/* Point wqe back to the previous one*/
1493 			wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1494 			break;
1495 		}
1496 		qp->s_cur = n;
1497 		/*
1498 		 * If we are starting the request from the beginning,
1499 		 * let the normal send code handle initialization.
1500 		 */
1501 		if (diff == 0) {
1502 			qp->s_state = OP(SEND_LAST);
1503 			goto done;
1504 		}
1505 
1506 		update_num_rd_atomic(qp, psn, wqe);
1507 	}
1508 	opcode = wqe->wr.opcode;
1509 
1510 	/*
1511 	 * Set the state to restart in the middle of a request.
1512 	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
1513 	 * See hfi1_make_rc_req().
1514 	 */
1515 	switch (opcode) {
1516 	case IB_WR_SEND:
1517 	case IB_WR_SEND_WITH_IMM:
1518 		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
1519 		break;
1520 
1521 	case IB_WR_RDMA_WRITE:
1522 	case IB_WR_RDMA_WRITE_WITH_IMM:
1523 		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
1524 		break;
1525 
1526 	case IB_WR_TID_RDMA_WRITE:
1527 		qp->s_state = TID_OP(WRITE_RESP);
1528 		break;
1529 
1530 	case IB_WR_RDMA_READ:
1531 		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1532 		break;
1533 
1534 	case IB_WR_TID_RDMA_READ:
1535 		qp->s_state = TID_OP(READ_RESP);
1536 		break;
1537 
1538 	default:
1539 		/*
1540 		 * This case shouldn't happen since its only
1541 		 * one PSN per req.
1542 		 */
1543 		qp->s_state = OP(SEND_LAST);
1544 	}
1545 done:
1546 	priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
1547 	qp->s_psn = psn;
1548 	/*
1549 	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
1550 	 * asynchronously before the send engine can get scheduled.
1551 	 * Doing it in hfi1_make_rc_req() is too late.
1552 	 */
1553 	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
1554 	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1555 		qp->s_flags |= RVT_S_WAIT_PSN;
1556 	qp->s_flags &= ~HFI1_S_AHG_VALID;
1557 	trace_hfi1_sender_reset_psn(qp);
1558 }
1559 
1560 /*
1561  * Back up requester to resend the last un-ACKed request.
1562  * The QP r_lock and s_lock should be held and interrupts disabled.
1563  */
hfi1_restart_rc(struct rvt_qp * qp,u32 psn,int wait)1564 void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
1565 {
1566 	struct hfi1_qp_priv *priv = qp->priv;
1567 	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1568 	struct hfi1_ibport *ibp;
1569 
1570 	lockdep_assert_held(&qp->r_lock);
1571 	lockdep_assert_held(&qp->s_lock);
1572 	trace_hfi1_sender_restart_rc(qp);
1573 	if (qp->s_retry == 0) {
1574 		if (qp->s_mig_state == IB_MIG_ARMED) {
1575 			hfi1_migrate_qp(qp);
1576 			qp->s_retry = qp->s_retry_cnt;
1577 		} else if (qp->s_last == qp->s_acked) {
1578 			/*
1579 			 * We need special handling for the OPFN request WQEs as
1580 			 * they are not allowed to generate real user errors
1581 			 */
1582 			if (wqe->wr.opcode == IB_WR_OPFN) {
1583 				struct hfi1_ibport *ibp =
1584 					to_iport(qp->ibqp.device, qp->port_num);
1585 				/*
1586 				 * Call opfn_conn_reply() with capcode and
1587 				 * remaining data as 0 to close out the
1588 				 * current request
1589 				 */
1590 				opfn_conn_reply(qp, priv->opfn.curr);
1591 				wqe = do_rc_completion(qp, wqe, ibp);
1592 				qp->s_flags &= ~RVT_S_WAIT_ACK;
1593 			} else {
1594 				trace_hfi1_tid_write_sender_restart_rc(qp, 0);
1595 				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
1596 					struct tid_rdma_request *req;
1597 
1598 					req = wqe_to_tid_req(wqe);
1599 					hfi1_kern_exp_rcv_clear_all(req);
1600 					hfi1_kern_clear_hw_flow(priv->rcd, qp);
1601 				}
1602 
1603 				hfi1_trdma_send_complete(qp, wqe,
1604 							 IB_WC_RETRY_EXC_ERR);
1605 				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1606 			}
1607 			return;
1608 		} else { /* need to handle delayed completion */
1609 			return;
1610 		}
1611 	} else {
1612 		qp->s_retry--;
1613 	}
1614 
1615 	ibp = to_iport(qp->ibqp.device, qp->port_num);
1616 	if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1617 	    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
1618 		ibp->rvp.n_rc_resends++;
1619 	else
1620 		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1621 
1622 	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
1623 			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1624 			 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
1625 	if (wait)
1626 		qp->s_flags |= RVT_S_SEND_ONE;
1627 	reset_psn(qp, psn);
1628 }
1629 
1630 /*
1631  * Set qp->s_sending_psn to the next PSN after the given one.
1632  * This would be psn+1 except when RDMA reads or TID RDMA ops
1633  * are present.
1634  */
reset_sending_psn(struct rvt_qp * qp,u32 psn)1635 static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
1636 {
1637 	struct rvt_swqe *wqe;
1638 	u32 n = qp->s_last;
1639 
1640 	lockdep_assert_held(&qp->s_lock);
1641 	/* Find the work request corresponding to the given PSN. */
1642 	for (;;) {
1643 		wqe = rvt_get_swqe_ptr(qp, n);
1644 		if (cmp_psn(psn, wqe->lpsn) <= 0) {
1645 			if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1646 			    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
1647 			    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
1648 				qp->s_sending_psn = wqe->lpsn + 1;
1649 			else
1650 				qp->s_sending_psn = psn + 1;
1651 			break;
1652 		}
1653 		if (++n == qp->s_size)
1654 			n = 0;
1655 		if (n == qp->s_tail)
1656 			break;
1657 	}
1658 }
1659 
1660 /**
1661  * hfi1_rc_verbs_aborted - handle abort status
1662  * @qp: the QP
1663  * @opah: the opa header
1664  *
1665  * This code modifies both ACK bit in BTH[2]
1666  * and the s_flags to go into send one mode.
1667  *
1668  * This serves to throttle the send engine to only
1669  * send a single packet in the likely case the
1670  * a link has gone down.
1671  */
hfi1_rc_verbs_aborted(struct rvt_qp * qp,struct hfi1_opa_header * opah)1672 void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1673 {
1674 	struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
1675 	u8 opcode = ib_bth_get_opcode(ohdr);
1676 	u32 psn;
1677 
1678 	/* ignore responses */
1679 	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1680 	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
1681 	    opcode == TID_OP(READ_RESP) ||
1682 	    opcode == TID_OP(WRITE_RESP))
1683 		return;
1684 
1685 	psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
1686 	ohdr->bth[2] = cpu_to_be32(psn);
1687 	qp->s_flags |= RVT_S_SEND_ONE;
1688 }
1689 
1690 /*
1691  * This should be called with the QP s_lock held and interrupts disabled.
1692  */
hfi1_rc_send_complete(struct rvt_qp * qp,struct hfi1_opa_header * opah)1693 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
1694 {
1695 	struct ib_other_headers *ohdr;
1696 	struct hfi1_qp_priv *priv = qp->priv;
1697 	struct rvt_swqe *wqe;
1698 	u32 opcode, head, tail;
1699 	u32 psn;
1700 	struct tid_rdma_request *req;
1701 
1702 	lockdep_assert_held(&qp->s_lock);
1703 	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
1704 		return;
1705 
1706 	ohdr = hfi1_get_rc_ohdr(opah);
1707 	opcode = ib_bth_get_opcode(ohdr);
1708 	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1709 	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
1710 	    opcode == TID_OP(READ_RESP) ||
1711 	    opcode == TID_OP(WRITE_RESP)) {
1712 		WARN_ON(!qp->s_rdma_ack_cnt);
1713 		qp->s_rdma_ack_cnt--;
1714 		return;
1715 	}
1716 
1717 	psn = ib_bth_get_psn(ohdr);
1718 	/*
1719 	 * Don't attempt to reset the sending PSN for packets in the
1720 	 * KDETH PSN space since the PSN does not match anything.
1721 	 */
1722 	if (opcode != TID_OP(WRITE_DATA) &&
1723 	    opcode != TID_OP(WRITE_DATA_LAST) &&
1724 	    opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
1725 		reset_sending_psn(qp, psn);
1726 
1727 	/* Handle TID RDMA WRITE packets differently */
1728 	if (opcode >= TID_OP(WRITE_REQ) &&
1729 	    opcode <= TID_OP(WRITE_DATA_LAST)) {
1730 		head = priv->s_tid_head;
1731 		tail = priv->s_tid_cur;
1732 		/*
1733 		 * s_tid_cur is set to s_tid_head in the case, where
1734 		 * a new TID RDMA request is being started and all
1735 		 * previous ones have been completed.
1736 		 * Therefore, we need to do a secondary check in order
1737 		 * to properly determine whether we should start the
1738 		 * RC timer.
1739 		 */
1740 		wqe = rvt_get_swqe_ptr(qp, tail);
1741 		req = wqe_to_tid_req(wqe);
1742 		if (head == tail && req->comp_seg < req->total_segs) {
1743 			if (tail == 0)
1744 				tail = qp->s_size - 1;
1745 			else
1746 				tail -= 1;
1747 		}
1748 	} else {
1749 		head = qp->s_tail;
1750 		tail = qp->s_acked;
1751 	}
1752 
1753 	/*
1754 	 * Start timer after a packet requesting an ACK has been sent and
1755 	 * there are still requests that haven't been acked.
1756 	 */
1757 	if ((psn & IB_BTH_REQ_ACK) && tail != head &&
1758 	    opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
1759 	    opcode != TID_OP(RESYNC) &&
1760 	    !(qp->s_flags &
1761 	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1762 	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
1763 		if (opcode == TID_OP(READ_REQ))
1764 			rvt_add_retry_timer_ext(qp, priv->timeout_shift);
1765 		else
1766 			rvt_add_retry_timer(qp);
1767 	}
1768 
1769 	/* Start TID RDMA ACK timer */
1770 	if ((opcode == TID_OP(WRITE_DATA) ||
1771 	     opcode == TID_OP(WRITE_DATA_LAST) ||
1772 	     opcode == TID_OP(RESYNC)) &&
1773 	    (psn & IB_BTH_REQ_ACK) &&
1774 	    !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
1775 	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
1776 		/*
1777 		 * The TID RDMA ACK packet could be received before this
1778 		 * function is called. Therefore, add the timer only if TID
1779 		 * RDMA ACK packets are actually pending.
1780 		 */
1781 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1782 		req = wqe_to_tid_req(wqe);
1783 		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
1784 		    req->ack_seg < req->cur_seg)
1785 			hfi1_add_tid_retry_timer(qp);
1786 	}
1787 
1788 	while (qp->s_last != qp->s_acked) {
1789 		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
1790 		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1791 		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1792 			break;
1793 		trdma_clean_swqe(qp, wqe);
1794 		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
1795 		rvt_qp_complete_swqe(qp,
1796 				     wqe,
1797 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
1798 				     IB_WC_SUCCESS);
1799 	}
1800 	/*
1801 	 * If we were waiting for sends to complete before re-sending,
1802 	 * and they are now complete, restart sending.
1803 	 */
1804 	trace_hfi1_sendcomplete(qp, psn);
1805 	if (qp->s_flags & RVT_S_WAIT_PSN &&
1806 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1807 		qp->s_flags &= ~RVT_S_WAIT_PSN;
1808 		qp->s_sending_psn = qp->s_psn;
1809 		qp->s_sending_hpsn = qp->s_psn - 1;
1810 		hfi1_schedule_send(qp);
1811 	}
1812 }
1813 
/* Record @psn in qp->s_last_psn. */
static inline void update_last_psn(struct rvt_qp *qp,
				   u32 psn)
{
	qp->s_last_psn = psn;
}
1818 
1819 /*
1820  * Generate a SWQE completion.
1821  * This is similar to hfi1_send_complete but has to check to be sure
1822  * that the SGEs are not being referenced if the SWQE is being resent.
1823  */
do_rc_completion(struct rvt_qp * qp,struct rvt_swqe * wqe,struct hfi1_ibport * ibp)1824 struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1825 				  struct rvt_swqe *wqe,
1826 				  struct hfi1_ibport *ibp)
1827 {
1828 	struct hfi1_qp_priv *priv = qp->priv;
1829 
1830 	lockdep_assert_held(&qp->s_lock);
1831 	/*
1832 	 * Don't decrement refcount and don't generate a
1833 	 * completion if the SWQE is being resent until the send
1834 	 * is finished.
1835 	 */
1836 	trace_hfi1_rc_completion(qp, wqe->lpsn);
1837 	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1838 	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1839 		trdma_clean_swqe(qp, wqe);
1840 		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
1841 		rvt_qp_complete_swqe(qp,
1842 				     wqe,
1843 				     ib_hfi1_wc_opcode[wqe->wr.opcode],
1844 				     IB_WC_SUCCESS);
1845 	} else {
1846 		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1847 
1848 		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
1849 		/*
1850 		 * If send progress not running attempt to progress
1851 		 * SDMA queue.
1852 		 */
1853 		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1854 			struct sdma_engine *engine;
1855 			u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
1856 			u8 sc5;
1857 
1858 			/* For now use sc to find engine */
1859 			sc5 = ibp->sl_to_sc[sl];
1860 			engine = qp_to_sdma_engine(qp, sc5);
1861 			sdma_engine_progress_schedule(engine);
1862 		}
1863 	}
1864 
1865 	qp->s_retry = qp->s_retry_cnt;
1866 	/*
1867 	 * Don't update the last PSN if the request being completed is
1868 	 * a TID RDMA WRITE request.
1869 	 * Completion of the TID RDMA WRITE requests are done by the
1870 	 * TID RDMA ACKs and as such could be for a request that has
1871 	 * already been ACKed as far as the IB state machine is
1872 	 * concerned.
1873 	 */
1874 	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
1875 		update_last_psn(qp, wqe->lpsn);
1876 
1877 	/*
1878 	 * If we are completing a request which is in the process of
1879 	 * being resent, we can stop re-sending it since we know the
1880 	 * responder has already seen it.
1881 	 */
1882 	if (qp->s_acked == qp->s_cur) {
1883 		if (++qp->s_cur >= qp->s_size)
1884 			qp->s_cur = 0;
1885 		qp->s_acked = qp->s_cur;
1886 		wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1887 		if (qp->s_acked != qp->s_tail) {
1888 			qp->s_state = OP(SEND_LAST);
1889 			qp->s_psn = wqe->psn;
1890 		}
1891 	} else {
1892 		if (++qp->s_acked >= qp->s_size)
1893 			qp->s_acked = 0;
1894 		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1895 			qp->s_draining = 0;
1896 		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1897 	}
1898 	if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
1899 		priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
1900 		hfi1_schedule_send(qp);
1901 	}
1902 	return wqe;
1903 }
1904 
set_restart_qp(struct rvt_qp * qp,struct hfi1_ctxtdata * rcd)1905 static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
1906 {
1907 	/* Retry this request. */
1908 	if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1909 		qp->r_flags |= RVT_R_RDMAR_SEQ;
1910 		hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
1911 		if (list_empty(&qp->rspwait)) {
1912 			qp->r_flags |= RVT_R_RSP_SEND;
1913 			rvt_get_qp(qp);
1914 			list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1915 		}
1916 	}
1917 }
1918 
1919 /**
1920  * update_qp_retry_state - Update qp retry state.
1921  * @qp: the QP
1922  * @psn: the packet sequence number of the TID RDMA WRITE RESP.
1923  * @spsn:  The start psn for the given TID RDMA WRITE swqe.
1924  * @lpsn:  The last psn for the given TID RDMA WRITE swqe.
1925  *
1926  * This function is called to update the qp retry state upon
1927  * receiving a TID WRITE RESP after the qp is scheduled to retry
1928  * a request.
1929  */
update_qp_retry_state(struct rvt_qp * qp,u32 psn,u32 spsn,u32 lpsn)1930 static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
1931 				  u32 lpsn)
1932 {
1933 	struct hfi1_qp_priv *qpriv = qp->priv;
1934 
1935 	qp->s_psn = psn + 1;
1936 	/*
1937 	 * If this is the first TID RDMA WRITE RESP packet for the current
1938 	 * request, change the s_state so that the retry will be processed
1939 	 * correctly. Similarly, if this is the last TID RDMA WRITE RESP
1940 	 * packet, change the s_state and advance the s_cur.
1941 	 */
1942 	if (cmp_psn(psn, lpsn) >= 0) {
1943 		qp->s_cur = qpriv->s_tid_cur + 1;
1944 		if (qp->s_cur >= qp->s_size)
1945 			qp->s_cur = 0;
1946 		qp->s_state = TID_OP(WRITE_REQ);
1947 	} else  if (!cmp_psn(psn, spsn)) {
1948 		qp->s_cur = qpriv->s_tid_cur;
1949 		qp->s_state = TID_OP(WRITE_RESP);
1950 	}
1951 }
1952 
/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @aeth: the AETH word of the response (rc_rcv_resp() passes it already
 *	  converted to CPU byte order)
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the response packet being processed; it is
 *	    compared against response opcodes such as
 *	    OP(RDMA_READ_RESPONSE_LAST), OP(ATOMIC_ACKNOWLEDGE) and
 *	    TID_OP(WRITE_RESP)
 * @val: value written to the first SGE of a completed atomic request, or
 *	 handed to opfn_conn_reply() for an IB_WR_OPFN request
 * @rcd: the receive context the packet arrived on; used to find the port
 *	 and, on a restart, the wait list the QP is queued on
 *
 * This is called from rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * May be called at interrupt level, with the QP s_lock held.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 *
 * The function first walks the send queue from s_acked, completing every
 * WQE whose last PSN is covered by the ACK, and then dispatches on the
 * ACK/NAK type encoded in the top bits of @aeth.
 */
int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
	      u64 val, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp;
	enum ib_wc_status status;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct rvt_swqe *wqe;
	/* Only the RDMA_READ_RESPONSE_ONLY special case below sets this to 1. */
	int ret = 0;
	u32 ack_psn;
	int diff;
	struct rvt_dev_info *rdi;

	lockdep_assert_held(&qp->s_lock);
	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> IB_AETH_NAK_SHIFT)
		ack_psn--;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	ibp = rcd_to_iport(rcd);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail_stop;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 *
		 * A TID RDMA WRITE request is likewise restarted when this
		 * response is not the one immediately after s_last_psn.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
		     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
		    (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		     (delta_psn(psn, qp->s_last_psn) != 1))) {
			set_restart_qp(qp, rcd);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail_stop;
		}
		/* Hand the atomic result back through the request's first SGE. */
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
			u64 *vaddr = wqe->sg_list[0].vaddr;
			*vaddr = val;
		}
		if (wqe->wr.opcode == IB_WR_OPFN)
			opfn_conn_reply(qp, val);

		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~(RVT_S_WAIT_FENCE |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			} else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
				qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			}
		}

		/*
		 * TID RDMA WRITE requests will be completed by the TID RDMA
		 * ACK packet handler (see tid_rdma.c).
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
			break;

		/* do_rc_completion() returns the next WQE to examine. */
		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
	}

	trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
	trace_hfi1_sender_do_rc_ack(qp);
	switch (aeth >> IB_AETH_NAK_SHIFT) {
	case 0:         /* ACK */
		this_cpu_inc(*ibp->rvp.rc_acks);
		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
			if (wqe_to_tid_req(wqe)->ack_pending)
				rvt_mod_retry_timer_ext(qp,
							qpriv->timeout_shift);
			else
				rvt_stop_rc_timers(qp);
		} else if (qp->s_acked != qp->s_tail) {
			struct rvt_swqe *__w = NULL;

			if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
				__w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);

			/*
			 * Stop timers if we've received all of the TID RDMA
			 * WRITE responses.
			 */
			if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
			    opcode == TID_OP(WRITE_RESP)) {
				/*
				 * Normally, the loop above would correctly
				 * process all WQEs from s_acked onward and
				 * either complete them or check for correct
				 * PSN sequencing.
				 * However, for TID RDMA, due to pipelining,
				 * the response may not be for the request at
				 * s_acked so the above loop would just be
				 * skipped. This does not allow for checking
				 * the PSN sequencing. It has to be done
				 * separately.
				 */
				if (cmp_psn(psn, qp->s_last_psn + 1)) {
					set_restart_qp(qp, rcd);
					goto bail_stop;
				}
				/*
				 * If the psn is being resent, stop the
				 * resending.
				 */
				if (qp->s_cur != qp->s_tail &&
				    cmp_psn(qp->s_psn, psn) <= 0)
					update_qp_retry_state(qp, psn,
							      __w->psn,
							      __w->lpsn);
				else if (--qpriv->pending_tid_w_resp)
					rvt_mod_retry_timer(qp);
				else
					rvt_stop_rc_timers(qp);
			} else {
				/*
				 * We are expecting more ACKs so
				 * mod the retry timer.
				 */
				rvt_mod_retry_timer(qp);
				/*
				 * We can stop re-sending the earlier packets
				 * and continue with the next packet the
				 * receiver wants.
				 */
				if (cmp_psn(qp->s_psn, psn) <= 0)
					reset_psn(qp, psn + 1);
			}
		} else {
			/* No more acks - kill all timers */
			rvt_stop_rc_timers(qp);
			if (cmp_psn(qp->s_psn, psn) <= 0) {
				qp->s_state = OP(SEND_LAST);
				qp->s_psn = psn + 1;
			}
		}
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}
		rvt_get_credit(qp, aeth);
		/* A successful ACK resets both retry counters. */
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If the current request is a TID RDMA WRITE request and the
		 * response is not a TID RDMA WRITE RESP packet, s_last_psn
		 * can't be advanced.
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		    opcode != TID_OP(WRITE_RESP) &&
		    cmp_psn(psn, wqe->psn) >= 0)
			return 1;
		update_last_psn(qp, psn);
		return 1;

	case 1:         /* RNR NAK */
		ibp->rvp.n_rnr_naks++;
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		if (qp->s_flags & RVT_S_WAIT_RNR)
			goto bail_stop;
		rdi = ib_to_rvt(qp->ibqp.device);
		/*
		 * Operations flagged RVT_OPERATION_IGN_RNR_CNT do not consume
		 * RNR retries.
		 * NOTE(review): a configured count of 7 is never decremented
		 * here, presumably because 7 means "retry forever" - confirm
		 * against the IB specification.
		 */
		if (!(rdi->post_parms[wqe->wr.opcode].flags &
		       RVT_OPERATION_IGN_RNR_CNT)) {
			if (qp->s_rnr_retry == 0) {
				status = IB_WC_RNR_RETRY_EXC_ERR;
				/* Jumps into the NAK case's error handling below. */
				goto class_b;
			}
			if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
				qp->s_rnr_retry--;
		}

		/*
		 * The last valid PSN is the previous PSN. For TID RDMA WRITE
		 * request, s_last_psn should be incremented only when a TID
		 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
		 * WRITE RESP packets.
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
			reset_psn(qp, qp->s_last_psn + 1);
		} else {
			update_last_psn(qp, psn - 1);
			reset_psn(qp, psn);
		}

		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
		rvt_stop_rc_timers(qp);
		rvt_add_rnr_timer(qp, aeth);
		return 0;

	case 3:         /* NAK */
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		/* The credit field of the AETH carries the NAK code. */
		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
			IB_AETH_CREDIT_MASK) {
		case 0: /* PSN sequence error */
			ibp->rvp.n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			hfi1_restart_rc(qp, psn, 0);
			hfi1_schedule_send(qp);
			break;

		case 1: /* Invalid Request */
			status = IB_WC_REM_INV_REQ_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 2: /* Remote Access Error */
			status = IB_WC_REM_ACCESS_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 3: /* Remote Operation Error */
			status = IB_WC_REM_OP_ERR;
			ibp->rvp.n_other_naks++;
			/*
			 * class_b is also entered from the RNR NAK case when
			 * the RNR retries are exhausted.  The failing WQE is
			 * completed with @status and the QP moved to error
			 * only when s_last == s_acked.
			 */
class_b:
			if (qp->s_last == qp->s_acked) {
				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
					hfi1_kern_read_tid_flow_free(qp);

				hfi1_trdma_send_complete(qp, wqe, status);
				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_retry = qp->s_retry_cnt;
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail_stop;

	default:                /* 2: reserved */
reserved:
		/* Ignore reserved NAK codes. */
		goto bail_stop;
	}
	/* cannot be reached  */
bail_stop:
	/* Common exit: stop the RC timers and return ret (0 unless set above). */
	rvt_stop_rc_timers(qp);
	return ret;
}
2258 
2259 /*
2260  * We have seen an out of sequence RDMA read middle or last packet.
2261  * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
2262  */
rdma_seq_err(struct rvt_qp * qp,struct hfi1_ibport * ibp,u32 psn,struct hfi1_ctxtdata * rcd)2263 static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
2264 			 struct hfi1_ctxtdata *rcd)
2265 {
2266 	struct rvt_swqe *wqe;
2267 
2268 	lockdep_assert_held(&qp->s_lock);
2269 	/* Remove QP from retry timer */
2270 	rvt_stop_rc_timers(qp);
2271 
2272 	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2273 
2274 	while (cmp_psn(psn, wqe->lpsn) > 0) {
2275 		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2276 		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2277 		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
2278 		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2279 		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
2280 			break;
2281 		wqe = do_rc_completion(qp, wqe, ibp);
2282 	}
2283 
2284 	ibp->rvp.n_rdma_seq++;
2285 	qp->r_flags |= RVT_R_RDMAR_SEQ;
2286 	hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
2287 	if (list_empty(&qp->rspwait)) {
2288 		qp->r_flags |= RVT_R_RSP_SEND;
2289 		rvt_get_qp(qp);
2290 		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2291 	}
2292 }
2293 
/**
 * rc_rcv_resp - process an incoming RC response packet
 * @packet: data packet information
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 *
 * The QP s_lock is taken here with interrupts saved and released on every
 * exit path.  For RDMA read FIRST/MIDDLE responses the lock is dropped
 * before the payload is copied; for ONLY/LAST responses the copy is done
 * with the lock still held.
 */
static void rc_rcv_resp(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	void *data = packet->payload;
	u32 tlen = packet->tlen;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp;
	struct ib_other_headers *ohdr = packet->ohdr;
	struct rvt_swqe *wqe;
	enum ib_wc_status status;
	unsigned long flags;
	int diff;
	u64 val;
	u32 aeth;
	u32 psn = ib_bth_get_psn(packet->ohdr);
	u32 pmtu = qp->pmtu;
	u16 hdrsize = packet->hlen;
	u8 opcode = packet->opcode;
	u8 pad = packet->pad;
	/* Bytes of the packet that are neither header nor payload. */
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);

	spin_lock_irqsave(&qp->s_lock, flags);
	trace_hfi1_ack(qp, psn);

	/* Ignore invalid responses. */
	if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = cmp_psn(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			aeth = be32_to_cpu(ohdr->u.aeth);
			if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
				rvt_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	/*
	 * Skip everything other than the PSN we expect, if we are waiting
	 * for a reply to a restarted RDMA read or atomic op.
	 */
	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
	}

	/* Nothing is outstanding, so there is nothing to acknowledge. */
	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	status = IB_WC_SUCCESS;

	/*
	 * There is no default case: an opcode not listed below leaves the
	 * switch and lands on ack_op_err.
	 */
	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (opcode == OP(ATOMIC_ACKNOWLEDGE))
			val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
		else
			val = 0;
		/* Plain and atomic ACKs are fully handled by do_rc_ack(). */
		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		/* do_rc_ack() may have advanced s_acked; re-read the WQE. */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
read_middle:
		/*
		 * A first/middle response must carry exactly pmtu bytes of
		 * payload and must leave data outstanding for a later packet.
		 */
		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/*
		 * We got a response so update the timeout.
		 * 4.096 usec. * (1 << qp->timeout)
		 */
		rvt_mod_retry_timer(qp);
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}

		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
			qp->s_retry = qp->s_retry_cnt;

		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
			     data, pmtu, false, false);
		/* The lock is already released, so skip ack_done. */
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
			goto ack_done;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + extra_bytes)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + extra_bytes)))
			goto ack_len_err;
read_last:
		/* From here on tlen is the payload length only. */
		tlen -= hdrsize + extra_bytes;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		aeth = be32_to_cpu(ohdr->u.aeth);
		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
			     data, tlen, false, false);
		/* All SGEs of the read should have been consumed by now. */
		WARN_ON(qp->s_rdma_read_sge.num_sge);
		/*
		 * Complete the read itself; it is always processed as a
		 * RDMA_READ_RESPONSE_LAST, even when the packet was an ONLY.
		 */
		(void)do_rc_ack(qp, aeth, psn,
				 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
		goto ack_done;
	}

ack_op_err:
	status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_seq_err:
	ibp = rcd_to_iport(rcd);
	rdma_seq_err(qp, ibp, psn, rcd);
	goto ack_done;

ack_len_err:
	status = IB_WC_LOC_LEN_ERR;
ack_err:
	/* Only fail the WQE and the QP when s_last == s_acked. */
	if (qp->s_last == qp->s_acked) {
		rvt_send_complete(qp, wqe, status);
		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	}
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}
2483 
rc_cancel_ack(struct rvt_qp * qp)2484 static inline void rc_cancel_ack(struct rvt_qp *qp)
2485 {
2486 	qp->r_adefered = 0;
2487 	if (list_empty(&qp->rspwait))
2488 		return;
2489 	list_del_init(&qp->rspwait);
2490 	qp->r_flags &= ~RVT_R_RSP_NAK;
2491 	rvt_put_qp(qp);
2492 }
2493 
/**
 * rc_rcv_error - process an incoming duplicate or error RC packet
 * @ohdr: the other headers for this packet
 * @data: the packet data (not referenced in this function)
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @rcd: the receive context
 *
 * This is called from hfi1_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 *
 * A positive @diff is treated as a sequence error and queues a deferred
 * PSN-error NAK; any other value is treated as a duplicate request and is
 * matched against the ack queue under the QP s_lock.
 *
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
				 struct rvt_qp *qp, u32 opcode, u32 psn,
				 int diff, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct rvt_ack_entry *e;
	unsigned long flags;
	u8 prev;
	u8 mra; /* most recent ACK */
	bool old_req;

	trace_hfi1_rcv_error(qp, psn);
	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			ibp->rvp.n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			rc_defered_ack(rcd, qp);
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 * old_req is true if there is an older response that is scheduled
	 * to be sent before sending this one.
	 */
	e = NULL;
	old_req = true;
	ibp->rvp.n_rc_dupreq++;

	spin_lock_irqsave(&qp->s_lock, flags);

	/*
	 * On return, prev is the ack queue index of the entry examined last,
	 * mra the index following it in queue order, and old_req the
	 * "scheduled" result; e is NULL when no matching entry exists.
	 */
	e = find_prev_entry(qp, psn, &prev, &mra, &old_req);

	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
			goto unlock_done;
		/* RETH comes after BTH */
		reth = &ohdr->u.rc.reth;
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = delta_psn(psn, e->psn) * qp->pmtu;
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len != e->rdma_sge.sge_length))
			goto unlock_done;
		/* Drop the old MR before the SGE is re-validated or cleared. */
		release_rdma_sge_mr(e);
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = get_ib_reth_vaddr(reth);
			int ok;

			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
					 IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->psn = psn;
		/* An older response is still scheduled; do not back up. */
		if (old_req)
			goto unlock_done;
		/* Back the ack queue tail up to the reused slot. */
		if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
			qp->s_acked_ack_queue = prev;
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send engine is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8)opcode || old_req)
			goto unlock_done;
		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
			qp->s_acked_ack_queue = prev;
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		/*
		 * Ignore this operation if it doesn't request an ACK
		 * or an earlier RDMA read or atomic is going to be resent.
		 */
		if (!(psn & IB_BTH_REQ_ACK) || old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (mra == qp->r_head_ack_queue) {
			/*
			 * The s_lock is released before the r_* fields are
			 * updated; returning 0 asks the caller to send the
			 * ACK.
			 */
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}

		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
			qp->s_acked_ack_queue = mra;
		qp->s_tail_ack_queue = mra;
		break;
	}
	/* Restart the response state machine from the backed-up tail. */
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags |= RVT_S_RESP_PENDING;
	qp->r_nak_state = 0;
	hfi1_schedule_send(qp);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}
2671 
/*
 * log_cca_event - record a congestion event in the port's log
 * @ppd: the port data holding the log
 * @sl: service level; values at or above OPA_MAX_SLS are ignored
 * @rlid: remote LID stored in the entry
 * @lqpn: local QP number (masked with RVT_QPN_MASK)
 * @rqpn: remote QP number (masked with RVT_QPN_MASK)
 * @svc_type: service type stored in the entry
 *
 * Under cc_log_lock: marks @sl in threshold_cong_event_map, bumps
 * threshold_event_counter and fills the cc_events slot at cc_log_idx,
 * advancing the index and wrapping it at OPA_CONG_LOG_ELEMS.
 */
static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
			  u32 lqpn, u32 rqpn, u8 svc_type)
{
	struct opa_hfi1_cong_log_event_internal *entry;
	unsigned long irq_flags;

	if (sl >= OPA_MAX_SLS)
		return;

	spin_lock_irqsave(&ppd->cc_log_lock, irq_flags);

	/* One bit per SL, eight SLs per map element. */
	ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
	ppd->threshold_event_counter++;

	/* Claim the current slot, then advance the index with wrap-around. */
	entry = &ppd->cc_events[ppd->cc_log_idx];
	ppd->cc_log_idx++;
	if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
		ppd->cc_log_idx = 0;

	entry->sl = sl;
	entry->svc_type = svc_type;
	entry->rlid = rlid;
	entry->lqpn = lqpn & RVT_QPN_MASK;
	entry->rqpn = rqpn & RVT_QPN_MASK;
	/* keep timestamp in units of 1.024 usec */
	entry->timestamp = ktime_get_ns() / 1024;

	spin_unlock_irqrestore(&ppd->cc_log_lock, irq_flags);
}
2699 
/*
 * process_becn - raise the congestion control index for a service level
 * @ppd: the port data
 * @sl: service level; values at or above OPA_MAX_SLS are ignored
 * @rlid: remote LID, passed on to the congestion log
 * @lqpn: local QP number, passed on to the congestion log
 * @rqpn: remote QP number, passed on to the congestion log
 * @svc_type: service type, passed on to the congestion log
 *
 * Returns without action when get_cc_state() yields no state.  Otherwise,
 * under cca_timer_lock, the SL's CCTI is increased by the configured
 * increment (clamped to ccti_limit), set_link_ipg() is called if the CCTI
 * changed, and the SL's hrtimer is started if it is not already active.
 * After the lock is dropped, an event is logged when a non-zero trigger
 * threshold has been reached.
 */
void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn,
		  u32 rqpn, u8 svc_type)
{
	struct cc_state *state;
	struct cca_timer *sl_timer;
	unsigned long irq_flags;
	u16 limit, step, period, cur_ccti;
	u8 threshold;

	if (sl >= OPA_MAX_SLS)
		return;

	state = get_cc_state(ppd);
	if (!state)
		return;

	/*
	 * 1) increase CCTI (for this SL)
	 * 2) select IPG (i.e., call set_link_ipg())
	 * 3) start timer
	 */
	limit = state->cct.ccti_limit;
	step = state->cong_setting.entries[sl].ccti_increase;
	period = state->cong_setting.entries[sl].ccti_timer;
	threshold = state->cong_setting.entries[sl].trigger_threshold;

	spin_lock_irqsave(&ppd->cca_timer_lock, irq_flags);

	sl_timer = &ppd->cca_timer[sl];
	if (sl_timer->ccti < limit) {
		/* Step up, but never beyond the configured limit. */
		if (sl_timer->ccti + step > limit)
			sl_timer->ccti = limit;
		else
			sl_timer->ccti += step;
		set_link_ipg(ppd);
	}

	/* Snapshot the index so it can be tested after the lock is dropped. */
	cur_ccti = sl_timer->ccti;

	if (!hrtimer_active(&sl_timer->hrtimer)) {
		/* ccti_timer is in units of 1.024 usec */
		unsigned long nsec = 1024 * period;

		hrtimer_start(&sl_timer->hrtimer, ns_to_ktime(nsec),
			      HRTIMER_MODE_REL_PINNED);
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, irq_flags);

	if (threshold && cur_ccti >= threshold)
		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
}
2754 
2755 /**
2756  * hfi1_rc_rcv - process an incoming RC packet
2757  * @packet: data packet information
2758  *
2759  * This is called from qp_rcv() to process an incoming RC packet
2760  * for the given QP.
2761  * May be called at interrupt level.
2762  */
hfi1_rc_rcv(struct hfi1_packet * packet)2763 void hfi1_rc_rcv(struct hfi1_packet *packet)
2764 {
2765 	struct hfi1_ctxtdata *rcd = packet->rcd;
2766 	void *data = packet->payload;
2767 	u32 tlen = packet->tlen;
2768 	struct rvt_qp *qp = packet->qp;
2769 	struct hfi1_qp_priv *qpriv = qp->priv;
2770 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
2771 	struct ib_other_headers *ohdr = packet->ohdr;
2772 	u32 opcode = packet->opcode;
2773 	u32 hdrsize = packet->hlen;
2774 	u32 psn = ib_bth_get_psn(packet->ohdr);
2775 	u32 pad = packet->pad;
2776 	struct ib_wc wc;
2777 	u32 pmtu = qp->pmtu;
2778 	int diff;
2779 	struct ib_reth *reth;
2780 	unsigned long flags;
2781 	int ret;
2782 	bool copy_last = false, fecn;
2783 	u32 rkey;
2784 	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
2785 
2786 	lockdep_assert_held(&qp->r_lock);
2787 
2788 	if (hfi1_ruc_check_hdr(ibp, packet))
2789 		return;
2790 
2791 	fecn = process_ecn(qp, packet);
2792 	opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
2793 
2794 	/*
2795 	 * Process responses (ACKs) before anything else.  Note that the
2796 	 * packet sequence number will be for something in the send work
2797 	 * queue rather than the expected receive packet sequence number.
2798 	 * In other words, this QP is the requester.
2799 	 */
2800 	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
2801 	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
2802 		rc_rcv_resp(packet);
2803 		return;
2804 	}
2805 
2806 	/* Compute 24 bits worth of difference. */
2807 	diff = delta_psn(psn, qp->r_psn);
2808 	if (unlikely(diff)) {
2809 		if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
2810 			return;
2811 		goto send_ack;
2812 	}
2813 
2814 	/* Check for opcode sequence errors. */
2815 	switch (qp->r_state) {
2816 	case OP(SEND_FIRST):
2817 	case OP(SEND_MIDDLE):
2818 		if (opcode == OP(SEND_MIDDLE) ||
2819 		    opcode == OP(SEND_LAST) ||
2820 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2821 		    opcode == OP(SEND_LAST_WITH_INVALIDATE))
2822 			break;
2823 		goto nack_inv;
2824 
2825 	case OP(RDMA_WRITE_FIRST):
2826 	case OP(RDMA_WRITE_MIDDLE):
2827 		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2828 		    opcode == OP(RDMA_WRITE_LAST) ||
2829 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2830 			break;
2831 		goto nack_inv;
2832 
2833 	default:
2834 		if (opcode == OP(SEND_MIDDLE) ||
2835 		    opcode == OP(SEND_LAST) ||
2836 		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2837 		    opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
2838 		    opcode == OP(RDMA_WRITE_MIDDLE) ||
2839 		    opcode == OP(RDMA_WRITE_LAST) ||
2840 		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2841 			goto nack_inv;
2842 		/*
2843 		 * Note that it is up to the requester to not send a new
2844 		 * RDMA read or atomic operation before receiving an ACK
2845 		 * for the previous operation.
2846 		 */
2847 		break;
2848 	}
2849 
2850 	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2851 		rvt_comm_est(qp);
2852 
2853 	/* OK, process the packet. */
2854 	switch (opcode) {
2855 	case OP(SEND_FIRST):
2856 		ret = rvt_get_rwqe(qp, false);
2857 		if (ret < 0)
2858 			goto nack_op_err;
2859 		if (!ret)
2860 			goto rnr_nak;
2861 		qp->r_rcv_len = 0;
2862 		fallthrough;
2863 	case OP(SEND_MIDDLE):
2864 	case OP(RDMA_WRITE_MIDDLE):
2865 send_middle:
2866 		/* Check for invalid length PMTU or posted rwqe len. */
2867 		/*
2868 		 * There will be no padding for 9B packet but 16B packets
2869 		 * will come in with some padding since we always add
2870 		 * CRC and LT bytes which will need to be flit aligned
2871 		 */
2872 		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2873 			goto nack_inv;
2874 		qp->r_rcv_len += pmtu;
2875 		if (unlikely(qp->r_rcv_len > qp->r_len))
2876 			goto nack_inv;
2877 		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
2878 		break;
2879 
2880 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2881 		/* consume RWQE */
2882 		ret = rvt_get_rwqe(qp, true);
2883 		if (ret < 0)
2884 			goto nack_op_err;
2885 		if (!ret)
2886 			goto rnr_nak;
2887 		goto send_last_imm;
2888 
2889 	case OP(SEND_ONLY):
2890 	case OP(SEND_ONLY_WITH_IMMEDIATE):
2891 	case OP(SEND_ONLY_WITH_INVALIDATE):
2892 		ret = rvt_get_rwqe(qp, false);
2893 		if (ret < 0)
2894 			goto nack_op_err;
2895 		if (!ret)
2896 			goto rnr_nak;
2897 		qp->r_rcv_len = 0;
2898 		if (opcode == OP(SEND_ONLY))
2899 			goto no_immediate_data;
2900 		if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
2901 			goto send_last_inv;
2902 		fallthrough;	/* for SEND_ONLY_WITH_IMMEDIATE */
2903 	case OP(SEND_LAST_WITH_IMMEDIATE):
2904 send_last_imm:
2905 		wc.ex.imm_data = ohdr->u.imm_data;
2906 		wc.wc_flags = IB_WC_WITH_IMM;
2907 		goto send_last;
2908 	case OP(SEND_LAST_WITH_INVALIDATE):
2909 send_last_inv:
2910 		rkey = be32_to_cpu(ohdr->u.ieth);
2911 		if (rvt_invalidate_rkey(qp, rkey))
2912 			goto no_immediate_data;
2913 		wc.ex.invalidate_rkey = rkey;
2914 		wc.wc_flags = IB_WC_WITH_INVALIDATE;
2915 		goto send_last;
2916 	case OP(RDMA_WRITE_LAST):
2917 		copy_last = rvt_is_user_qp(qp);
2918 		fallthrough;
2919 	case OP(SEND_LAST):
2920 no_immediate_data:
2921 		wc.wc_flags = 0;
2922 		wc.ex.imm_data = 0;
2923 send_last:
2924 		/* Check for invalid length. */
2925 		/* LAST len should be >= 1 */
2926 		if (unlikely(tlen < (hdrsize + extra_bytes)))
2927 			goto nack_inv;
2928 		/* Don't count the CRC(and padding and LT byte for 16B). */
2929 		tlen -= (hdrsize + extra_bytes);
2930 		wc.byte_len = tlen + qp->r_rcv_len;
2931 		if (unlikely(wc.byte_len > qp->r_len))
2932 			goto nack_inv;
2933 		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
2934 		rvt_put_ss(&qp->r_sge);
2935 		qp->r_msn++;
2936 		if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
2937 			break;
2938 		wc.wr_id = qp->r_wr_id;
2939 		wc.status = IB_WC_SUCCESS;
2940 		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2941 		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2942 			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2943 		else
2944 			wc.opcode = IB_WC_RECV;
2945 		wc.qp = &qp->ibqp;
2946 		wc.src_qp = qp->remote_qpn;
2947 		wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
2948 		/*
2949 		 * It seems that IB mandates the presence of an SL in a
2950 		 * work completion only for the UD transport (see section
2951 		 * 11.4.2 of IBTA Vol. 1).
2952 		 *
2953 		 * However, the way the SL is chosen below is consistent
2954 		 * with the way that IB/qib works and is trying avoid
2955 		 * introducing incompatibilities.
2956 		 *
2957 		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2958 		 */
2959 		wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
2960 		/* zero fields that are N/A */
2961 		wc.vendor_err = 0;
2962 		wc.pkey_index = 0;
2963 		wc.dlid_path_bits = 0;
2964 		wc.port_num = 0;
2965 		/* Signal completion event if the solicited bit is set. */
2966 		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
2967 		break;
2968 
2969 	case OP(RDMA_WRITE_ONLY):
2970 		copy_last = rvt_is_user_qp(qp);
2971 		fallthrough;
2972 	case OP(RDMA_WRITE_FIRST):
2973 	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2974 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2975 			goto nack_inv;
2976 		/* consume RWQE */
2977 		reth = &ohdr->u.rc.reth;
2978 		qp->r_len = be32_to_cpu(reth->length);
2979 		qp->r_rcv_len = 0;
2980 		qp->r_sge.sg_list = NULL;
2981 		if (qp->r_len != 0) {
2982 			u32 rkey = be32_to_cpu(reth->rkey);
2983 			u64 vaddr = get_ib_reth_vaddr(reth);
2984 			int ok;
2985 
2986 			/* Check rkey & NAK */
2987 			ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2988 					 rkey, IB_ACCESS_REMOTE_WRITE);
2989 			if (unlikely(!ok))
2990 				goto nack_acc;
2991 			qp->r_sge.num_sge = 1;
2992 		} else {
2993 			qp->r_sge.num_sge = 0;
2994 			qp->r_sge.sge.mr = NULL;
2995 			qp->r_sge.sge.vaddr = NULL;
2996 			qp->r_sge.sge.length = 0;
2997 			qp->r_sge.sge.sge_length = 0;
2998 		}
2999 		if (opcode == OP(RDMA_WRITE_FIRST))
3000 			goto send_middle;
3001 		else if (opcode == OP(RDMA_WRITE_ONLY))
3002 			goto no_immediate_data;
3003 		ret = rvt_get_rwqe(qp, true);
3004 		if (ret < 0)
3005 			goto nack_op_err;
3006 		if (!ret) {
3007 			/* peer will send again */
3008 			rvt_put_ss(&qp->r_sge);
3009 			goto rnr_nak;
3010 		}
3011 		wc.ex.imm_data = ohdr->u.rc.imm_data;
3012 		wc.wc_flags = IB_WC_WITH_IMM;
3013 		goto send_last;
3014 
3015 	case OP(RDMA_READ_REQUEST): {
3016 		struct rvt_ack_entry *e;
3017 		u32 len;
3018 		u8 next;
3019 
3020 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
3021 			goto nack_inv;
3022 		next = qp->r_head_ack_queue + 1;
3023 		/* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
3024 		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3025 			next = 0;
3026 		spin_lock_irqsave(&qp->s_lock, flags);
3027 		if (unlikely(next == qp->s_acked_ack_queue)) {
3028 			if (!qp->s_ack_queue[next].sent)
3029 				goto nack_inv_unlck;
3030 			update_ack_queue(qp, next);
3031 		}
3032 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
3033 		release_rdma_sge_mr(e);
3034 		reth = &ohdr->u.rc.reth;
3035 		len = be32_to_cpu(reth->length);
3036 		if (len) {
3037 			u32 rkey = be32_to_cpu(reth->rkey);
3038 			u64 vaddr = get_ib_reth_vaddr(reth);
3039 			int ok;
3040 
3041 			/* Check rkey & NAK */
3042 			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
3043 					 rkey, IB_ACCESS_REMOTE_READ);
3044 			if (unlikely(!ok))
3045 				goto nack_acc_unlck;
3046 			/*
3047 			 * Update the next expected PSN.  We add 1 later
3048 			 * below, so only add the remainder here.
3049 			 */
3050 			qp->r_psn += rvt_div_mtu(qp, len - 1);
3051 		} else {
3052 			e->rdma_sge.mr = NULL;
3053 			e->rdma_sge.vaddr = NULL;
3054 			e->rdma_sge.length = 0;
3055 			e->rdma_sge.sge_length = 0;
3056 		}
3057 		e->opcode = opcode;
3058 		e->sent = 0;
3059 		e->psn = psn;
3060 		e->lpsn = qp->r_psn;
3061 		/*
3062 		 * We need to increment the MSN here instead of when we
3063 		 * finish sending the result since a duplicate request would
3064 		 * increment it more than once.
3065 		 */
3066 		qp->r_msn++;
3067 		qp->r_psn++;
3068 		qp->r_state = opcode;
3069 		qp->r_nak_state = 0;
3070 		qp->r_head_ack_queue = next;
3071 		qpriv->r_tid_alloc = qp->r_head_ack_queue;
3072 
3073 		/* Schedule the send engine. */
3074 		qp->s_flags |= RVT_S_RESP_PENDING;
3075 		if (fecn)
3076 			qp->s_flags |= RVT_S_ECN;
3077 		hfi1_schedule_send(qp);
3078 
3079 		spin_unlock_irqrestore(&qp->s_lock, flags);
3080 		return;
3081 	}
3082 
3083 	case OP(COMPARE_SWAP):
3084 	case OP(FETCH_ADD): {
3085 		struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
3086 		u64 vaddr = get_ib_ateth_vaddr(ateth);
3087 		bool opfn = opcode == OP(COMPARE_SWAP) &&
3088 			vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
3089 		struct rvt_ack_entry *e;
3090 		atomic64_t *maddr;
3091 		u64 sdata;
3092 		u32 rkey;
3093 		u8 next;
3094 
3095 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
3096 			     !opfn))
3097 			goto nack_inv;
3098 		next = qp->r_head_ack_queue + 1;
3099 		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3100 			next = 0;
3101 		spin_lock_irqsave(&qp->s_lock, flags);
3102 		if (unlikely(next == qp->s_acked_ack_queue)) {
3103 			if (!qp->s_ack_queue[next].sent)
3104 				goto nack_inv_unlck;
3105 			update_ack_queue(qp, next);
3106 		}
3107 		e = &qp->s_ack_queue[qp->r_head_ack_queue];
3108 		release_rdma_sge_mr(e);
3109 		/* Process OPFN special virtual address */
3110 		if (opfn) {
3111 			opfn_conn_response(qp, e, ateth);
3112 			goto ack;
3113 		}
3114 		if (unlikely(vaddr & (sizeof(u64) - 1)))
3115 			goto nack_inv_unlck;
3116 		rkey = be32_to_cpu(ateth->rkey);
3117 		/* Check rkey & NAK */
3118 		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
3119 					  vaddr, rkey,
3120 					  IB_ACCESS_REMOTE_ATOMIC)))
3121 			goto nack_acc_unlck;
3122 		/* Perform atomic OP and save result. */
3123 		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
3124 		sdata = get_ib_ateth_swap(ateth);
3125 		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
3126 			(u64)atomic64_add_return(sdata, maddr) - sdata :
3127 			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
3128 				      get_ib_ateth_compare(ateth),
3129 				      sdata);
3130 		rvt_put_mr(qp->r_sge.sge.mr);
3131 		qp->r_sge.num_sge = 0;
3132 ack:
3133 		e->opcode = opcode;
3134 		e->sent = 0;
3135 		e->psn = psn;
3136 		e->lpsn = psn;
3137 		qp->r_msn++;
3138 		qp->r_psn++;
3139 		qp->r_state = opcode;
3140 		qp->r_nak_state = 0;
3141 		qp->r_head_ack_queue = next;
3142 		qpriv->r_tid_alloc = qp->r_head_ack_queue;
3143 
3144 		/* Schedule the send engine. */
3145 		qp->s_flags |= RVT_S_RESP_PENDING;
3146 		if (fecn)
3147 			qp->s_flags |= RVT_S_ECN;
3148 		hfi1_schedule_send(qp);
3149 
3150 		spin_unlock_irqrestore(&qp->s_lock, flags);
3151 		return;
3152 	}
3153 
3154 	default:
3155 		/* NAK unknown opcodes. */
3156 		goto nack_inv;
3157 	}
3158 	qp->r_psn++;
3159 	qp->r_state = opcode;
3160 	qp->r_ack_psn = psn;
3161 	qp->r_nak_state = 0;
3162 	/* Send an ACK if requested or required. */
3163 	if (psn & IB_BTH_REQ_ACK || fecn) {
3164 		if (packet->numpkt == 0 || fecn ||
3165 		    qp->r_adefered >= HFI1_PSN_CREDIT) {
3166 			rc_cancel_ack(qp);
3167 			goto send_ack;
3168 		}
3169 		qp->r_adefered++;
3170 		rc_defered_ack(rcd, qp);
3171 	}
3172 	return;
3173 
3174 rnr_nak:
3175 	qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
3176 	qp->r_ack_psn = qp->r_psn;
3177 	/* Queue RNR NAK for later */
3178 	rc_defered_ack(rcd, qp);
3179 	return;
3180 
3181 nack_op_err:
3182 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3183 	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
3184 	qp->r_ack_psn = qp->r_psn;
3185 	/* Queue NAK for later */
3186 	rc_defered_ack(rcd, qp);
3187 	return;
3188 
3189 nack_inv_unlck:
3190 	spin_unlock_irqrestore(&qp->s_lock, flags);
3191 nack_inv:
3192 	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3193 	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3194 	qp->r_ack_psn = qp->r_psn;
3195 	/* Queue NAK for later */
3196 	rc_defered_ack(rcd, qp);
3197 	return;
3198 
3199 nack_acc_unlck:
3200 	spin_unlock_irqrestore(&qp->s_lock, flags);
3201 nack_acc:
3202 	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
3203 	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3204 	qp->r_ack_psn = qp->r_psn;
3205 send_ack:
3206 	hfi1_send_rc_ack(packet, fecn);
3207 }
3208 
hfi1_rc_hdrerr(struct hfi1_ctxtdata * rcd,struct hfi1_packet * packet,struct rvt_qp * qp)3209 void hfi1_rc_hdrerr(
3210 	struct hfi1_ctxtdata *rcd,
3211 	struct hfi1_packet *packet,
3212 	struct rvt_qp *qp)
3213 {
3214 	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
3215 	int diff;
3216 	u32 opcode;
3217 	u32 psn;
3218 
3219 	if (hfi1_ruc_check_hdr(ibp, packet))
3220 		return;
3221 
3222 	psn = ib_bth_get_psn(packet->ohdr);
3223 	opcode = ib_bth_get_opcode(packet->ohdr);
3224 
3225 	/* Only deal with RDMA Writes for now */
3226 	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
3227 		diff = delta_psn(psn, qp->r_psn);
3228 		if (!qp->r_nak_state && diff >= 0) {
3229 			ibp->rvp.n_rc_seqnak++;
3230 			qp->r_nak_state = IB_NAK_PSN_ERROR;
3231 			/* Use the expected PSN. */
3232 			qp->r_ack_psn = qp->r_psn;
3233 			/*
3234 			 * Wait to send the sequence
3235 			 * NAK until all packets
3236 			 * in the receive queue have
3237 			 * been processed.
3238 			 * Otherwise, we end up
3239 			 * propagating congestion.
3240 			 */
3241 			rc_defered_ack(rcd, qp);
3242 		} /* Out of sequence NAK */
3243 	} /* QP Request NAKs */
3244 }
3245