// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include "gve_dqo.h"
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

15 /* Returns true if a gve_tx_pending_packet_dqo object is available. */
gve_has_pending_packet(struct gve_tx_ring * tx)16 static bool gve_has_pending_packet(struct gve_tx_ring *tx)
17 {
18 /* Check TX path's list. */
19 if (tx->dqo_tx.free_pending_packets != -1)
20 return true;
21
22 /* Check completion handler's list. */
23 if (atomic_read_acquire(&tx->dqo_compl.free_pending_packets) != -1)
24 return true;
25
26 return false;
27 }
28
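/* Take a pending packet from the TX path's free list, stealing the completion
 * handler's free list if the TX path's list is empty. Returns NULL if neither
 * list has an entry available.
 */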
static struct gve_tx_pending_packet_dqo *
gve_alloc_pending_packet(struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 index;

	index = tx->dqo_tx.free_pending_packets;

	/* No pending_packets available, try to steal the list from the
	 * completion handler.
	 */
	if (unlikely(index == -1)) {
		tx->dqo_tx.free_pending_packets =
			atomic_xchg(&tx->dqo_compl.free_pending_packets, -1);
		index = tx->dqo_tx.free_pending_packets;

		if (unlikely(index == -1))
			return NULL;
	}

	pending_packet = &tx->dqo.pending_packets[index];

	/* Remove pending_packet from free list */
	tx->dqo_tx.free_pending_packets = pending_packet->next;
	pending_packet->state = GVE_PACKET_STATE_PENDING_DATA_COMPL;

	return pending_packet;
}

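/* Return a pending packet to the completion handler's free list. Uses a
 * lock-free cmpxchg loop because the TX path may concurrently steal the
 * whole list.
 */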
static void
gve_free_pending_packet(struct gve_tx_ring *tx,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 index = pending_packet - tx->dqo.pending_packets;

	pending_packet->state = GVE_PACKET_STATE_UNALLOCATED;
	while (true) {
		s16 old_head = atomic_read_acquire(&tx->dqo_compl.free_pending_packets);

		pending_packet->next = old_head;
		if (atomic_cmpxchg(&tx->dqo_compl.free_pending_packets,
				   old_head, index) == old_head) {
			break;
		}
	}
}

/* gve_tx_clean_pending_packets - Cleans up all pending tx requests and
 * buffers.
 */
static void gve_tx_clean_pending_packets(struct gve_tx_ring *tx)
{
	int i;

	for (i = 0; i < tx->dqo.num_pending_packets; i++) {
		struct gve_tx_pending_packet_dqo *cur_state =
			&tx->dqo.pending_packets[i];
		int j;

		for (j = 0; j < cur_state->num_bufs; j++) {
			if (j == 0) {
				dma_unmap_single(tx->dev,
						 dma_unmap_addr(cur_state, dma[j]),
						 dma_unmap_len(cur_state, len[j]),
						 DMA_TO_DEVICE);
			} else {
				dma_unmap_page(tx->dev,
					       dma_unmap_addr(cur_state, dma[j]),
					       dma_unmap_len(cur_state, len[j]),
					       DMA_TO_DEVICE);
			}
		}
		if (cur_state->skb) {
			dev_consume_skb_any(cur_state->skb);
			cur_state->skb = NULL;
		}
	}
}

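/* Release all DMA and host memory associated with TX queue `idx`:
 * queue resources, completion ring, descriptor ring, and pending packet state.
 */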
static void gve_tx_free_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	size_t bytes;

	gve_tx_remove_from_block(priv, idx);

	if (tx->q_resources) {
		dma_free_coherent(hdev, sizeof(*tx->q_resources),
				  tx->q_resources, tx->q_resources_bus);
		tx->q_resources = NULL;
	}

	if (tx->dqo.compl_ring) {
		bytes = sizeof(tx->dqo.compl_ring[0]) *
			(tx->dqo.complq_mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.compl_ring,
				  tx->complq_bus_dqo);
		tx->dqo.compl_ring = NULL;
	}

	if (tx->dqo.tx_ring) {
		bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
		dma_free_coherent(hdev, bytes, tx->dqo.tx_ring, tx->bus);
		tx->dqo.tx_ring = NULL;
	}

	kvfree(tx->dqo.pending_packets);
	tx->dqo.pending_packets = NULL;

	netif_dbg(priv, drv, priv->dev, "freed tx queue %d\n", idx);
}

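/* Allocate the descriptor ring, completion ring, and pending packet state for
 * TX queue `idx`. On any allocation failure, everything allocated so far is
 * released via gve_tx_free_ring_dqo() and -ENOMEM is returned.
 */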
static int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int idx)
{
	struct gve_tx_ring *tx = &priv->tx[idx];
	struct device *hdev = &priv->pdev->dev;
	int num_pending_packets;
	size_t bytes;
	int i;

	memset(tx, 0, sizeof(*tx));
	tx->q_num = idx;
	tx->dev = &priv->pdev->dev;
	tx->netdev_txq = netdev_get_tx_queue(priv->dev, idx);
	atomic_set_release(&tx->dqo_compl.hw_tx_head, 0);

	/* Queue sizes must be a power of 2 */
	tx->mask = priv->tx_desc_cnt - 1;
	tx->dqo.complq_mask = priv->options_dqo_rda.tx_comp_ring_entries - 1;

	/* The max number of pending packets determines the maximum number of
	 * descriptors which may be written to the completion queue.
	 *
	 * We must set the number small enough to make sure we never overrun the
	 * completion queue.
	 */
	num_pending_packets = tx->dqo.complq_mask + 1;

	/* Reserve space for descriptor completions, which will be reported at
	 * most every GVE_TX_MIN_RE_INTERVAL packets.
	 */
	num_pending_packets -=
		(tx->dqo.complq_mask + 1) / GVE_TX_MIN_RE_INTERVAL;

	/* Each packet may have at most 2 buffer completions if it receives both
	 * a miss and reinjection completion.
	 */
	num_pending_packets /= 2;

	tx->dqo.num_pending_packets = min_t(int, num_pending_packets, S16_MAX);
	tx->dqo.pending_packets = kvcalloc(tx->dqo.num_pending_packets,
					   sizeof(tx->dqo.pending_packets[0]),
					   GFP_KERNEL);
	if (!tx->dqo.pending_packets)
		goto err;

	/* Set up linked list of pending packets */
	for (i = 0; i < tx->dqo.num_pending_packets - 1; i++)
		tx->dqo.pending_packets[i].next = i + 1;

	tx->dqo.pending_packets[tx->dqo.num_pending_packets - 1].next = -1;
	atomic_set_release(&tx->dqo_compl.free_pending_packets, -1);
	tx->dqo_compl.miss_completions.head = -1;
	tx->dqo_compl.miss_completions.tail = -1;
	tx->dqo_compl.timed_out_completions.head = -1;
	tx->dqo_compl.timed_out_completions.tail = -1;

	bytes = sizeof(tx->dqo.tx_ring[0]) * (tx->mask + 1);
	tx->dqo.tx_ring = dma_alloc_coherent(hdev, bytes, &tx->bus, GFP_KERNEL);
	if (!tx->dqo.tx_ring)
		goto err;

	bytes = sizeof(tx->dqo.compl_ring[0]) * (tx->dqo.complq_mask + 1);
	tx->dqo.compl_ring = dma_alloc_coherent(hdev, bytes,
						&tx->complq_bus_dqo,
						GFP_KERNEL);
	if (!tx->dqo.compl_ring)
		goto err;

	tx->q_resources = dma_alloc_coherent(hdev, sizeof(*tx->q_resources),
					     &tx->q_resources_bus, GFP_KERNEL);
	if (!tx->q_resources)
		goto err;

	gve_tx_add_to_block(priv, idx);

	return 0;

err:
	gve_tx_free_ring_dqo(priv, idx);
	return -ENOMEM;
}

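/* Allocate every configured TX ring; on failure, free the rings allocated so
 * far and return the error.
 */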
int gve_tx_alloc_rings_dqo(struct gve_priv *priv)
{
	int err = 0;
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		err = gve_tx_alloc_ring_dqo(priv, i);
		if (err) {
			netif_err(priv, drv, priv->dev,
				  "Failed to alloc tx ring=%d: err=%d\n",
				  i, err);
			goto err;
		}
	}

	return 0;

err:
	for (i--; i >= 0; i--)
		gve_tx_free_ring_dqo(priv, i);

	return err;
}

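/* Drain outstanding completions and pending packets, then free every TX ring. */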
void gve_tx_free_rings_dqo(struct gve_priv *priv)
{
	int i;

	for (i = 0; i < priv->tx_cfg.num_queues; i++) {
		struct gve_tx_ring *tx = &priv->tx[i];

		gve_clean_tx_done_dqo(priv, tx, /*napi=*/NULL);
		netdev_tx_reset_queue(tx->netdev_txq);
		gve_tx_clean_pending_packets(tx);

		gve_tx_free_ring_dqo(priv, i);
	}
}

/* Returns the number of slots available in the ring */
static u32 num_avail_tx_slots(const struct gve_tx_ring *tx)
{
	u32 num_used = (tx->dqo_tx.tail - tx->dqo_tx.head) & tx->mask;

	return tx->mask - num_used;
}

/* Stops the queue if the number of available descriptors is less than 'count'.
 * Return: 0 if stopping the queue is not required.
 */
static int gve_maybe_stop_tx_dqo(struct gve_tx_ring *tx, int count)
{
	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* Update cached TX head pointer */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(gve_has_pending_packet(tx) &&
		   num_avail_tx_slots(tx) >= count))
		return 0;

	/* No space, so stop the queue */
	tx->stop_queue++;
	netif_tx_stop_queue(tx->netdev_txq);

	/* Sync with restarting queue in `gve_tx_poll_dqo()` */
	mb();

	/* After stopping the queue, check if we can transmit again in order to
	 * avoid a TOCTOU bug.
	 */
	tx->dqo_tx.head = atomic_read_acquire(&tx->dqo_compl.hw_tx_head);

	if (likely(!gve_has_pending_packet(tx) ||
		   num_avail_tx_slots(tx) < count))
		return -EBUSY;

	netif_tx_start_queue(tx->netdev_txq);
	tx->wake_queue++;
	return 0;
}

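/* Fill TX metadata from the skb. When an L4 hash is present, a non-zero
 * 15-bit path hash is derived from it.
 */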
static void gve_extract_tx_metadata_dqo(const struct sk_buff *skb,
					struct gve_tx_metadata_dqo *metadata)
{
	memset(metadata, 0, sizeof(*metadata));
	metadata->version = GVE_TX_METADATA_VERSION_DQO;

	if (skb->l4_hash) {
		u16 path_hash = skb->hash ^ (skb->hash >> 16);

		path_hash &= (1 << 15) - 1;
		if (unlikely(path_hash == 0))
			path_hash = ~path_hash;

		metadata->path_hash = path_hash;
	}
}

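/* Write packet descriptors for a single buffer, splitting it into chunks of
 * at most GVE_TX_MAX_BUF_SIZE_DQO bytes. `desc_idx` is advanced past the
 * descriptors written.
 */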
static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, u32 *desc_idx,
				     struct sk_buff *skb, u32 len, u64 addr,
				     s16 compl_tag, bool eop, bool is_gso)
{
	const bool checksum_offload_en = skb->ip_summed == CHECKSUM_PARTIAL;

	while (len > 0) {
		struct gve_tx_pkt_desc_dqo *desc =
			&tx->dqo.tx_ring[*desc_idx].pkt;
		u32 cur_len = min_t(u32, len, GVE_TX_MAX_BUF_SIZE_DQO);
		bool cur_eop = eop && cur_len == len;

		*desc = (struct gve_tx_pkt_desc_dqo){
			.buf_addr = cpu_to_le64(addr),
			.dtype = GVE_TX_PKT_DESC_DTYPE_DQO,
			.end_of_packet = cur_eop,
			.checksum_offload_enable = checksum_offload_en,
			.compl_tag = cpu_to_le16(compl_tag),
			.buf_size = cur_len,
		};

		addr += cur_len;
		len -= cur_len;
		*desc_idx = (*desc_idx + 1) & tx->mask;
	}
}

/* Validates and prepares `skb` for TSO.
 *
 * Returns header length, or < 0 if invalid.
 */
static int gve_prep_tso(struct sk_buff *skb)
{
	struct tcphdr *tcp;
	int header_len;
	u32 paylen;
	int err;

	/* Note: HW requires MSS (gso_size) to be <= 9728 and the total length
	 * of the TSO to be <= 262143.
	 *
	 * However, we don't validate these because:
	 * - Hypervisor enforces a limit of 9K MTU
	 * - Kernel will not produce a TSO larger than 64k
	 */

	if (unlikely(skb_shinfo(skb)->gso_size < GVE_TX_MIN_TSO_MSS_DQO))
		return -1;

	/* Needed because we will modify the header. */
	err = skb_cow_head(skb, 0);
	if (err < 0)
		return err;

	tcp = tcp_hdr(skb);

	/* Remove payload length from checksum. */
	paylen = skb->len - skb_transport_offset(skb);

	switch (skb_shinfo(skb)->gso_type) {
	case SKB_GSO_TCPV4:
	case SKB_GSO_TCPV6:
		csum_replace_by_diff(&tcp->check,
				     (__force __wsum)htonl(paylen));

		/* Compute length of segmentation header. */
		header_len = skb_tcp_all_headers(skb);
		break;
	default:
		return -EINVAL;
	}

	if (unlikely(header_len > GVE_TX_MAX_HDR_SIZE_DQO))
		return -EINVAL;

	return header_len;
}

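/* Fill a TSO context descriptor from the skb's GSO state and the extracted
 * metadata bytes.
 */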
static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc,
				     const struct sk_buff *skb,
				     const struct gve_tx_metadata_dqo *metadata,
				     int header_len)
{
	*desc = (struct gve_tx_tso_context_desc_dqo){
		.header_len = header_len,
		.cmd_dtype = {
			.dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO,
			.tso = 1,
		},
		.flex0 = metadata->bytes[0],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
	};
	desc->tso_total_len = skb->len - header_len;
	desc->mss = skb_shinfo(skb)->gso_size;
}

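/* Fill a general context descriptor carrying only the extracted metadata. */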
static void
gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc,
			     const struct gve_tx_metadata_dqo *metadata)
{
	*desc = (struct gve_tx_general_context_desc_dqo){
		.flex0 = metadata->bytes[0],
		.flex1 = metadata->bytes[1],
		.flex2 = metadata->bytes[2],
		.flex3 = metadata->bytes[3],
		.flex4 = metadata->bytes[4],
		.flex5 = metadata->bytes[5],
		.flex6 = metadata->bytes[6],
		.flex7 = metadata->bytes[7],
		.flex8 = metadata->bytes[8],
		.flex9 = metadata->bytes[9],
		.flex10 = metadata->bytes[10],
		.flex11 = metadata->bytes[11],
		.cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO},
	};
}

/* Returns 0 on success, or < 0 on error.
 *
 * Before this function is called, the caller must ensure
 * gve_has_pending_packet(tx) returns true.
 */
static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx,
				      struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const bool is_gso = skb_is_gso(skb);
	u32 desc_idx = tx->dqo_tx.tail;

	struct gve_tx_pending_packet_dqo *pkt;
	struct gve_tx_metadata_dqo metadata;
	s16 completion_tag;
	int i;

	pkt = gve_alloc_pending_packet(tx);
	pkt->skb = skb;
	pkt->num_bufs = 0;
	completion_tag = pkt - tx->dqo.pending_packets;

	gve_extract_tx_metadata_dqo(skb, &metadata);
	if (is_gso) {
		int header_len = gve_prep_tso(skb);

		if (unlikely(header_len < 0))
			goto err;

		gve_tx_fill_tso_ctx_desc(&tx->dqo.tx_ring[desc_idx].tso_ctx,
					 skb, &metadata, header_len);
		desc_idx = (desc_idx + 1) & tx->mask;
	}

	gve_tx_fill_general_ctx_desc(&tx->dqo.tx_ring[desc_idx].general_ctx,
				     &metadata);
	desc_idx = (desc_idx + 1) & tx->mask;

	/* Note: HW requires that the size of a non-TSO packet be within the
	 * range of [17, 9728].
	 *
	 * We don't double check because
	 * - We limited `netdev->min_mtu` to ETH_MIN_MTU.
	 * - Hypervisor won't allow MTU larger than 9216.
	 */

	/* Map the linear portion of skb */
	{
		u32 len = skb_headlen(skb);
		dma_addr_t addr;

		addr = dma_map_single(tx->dev, skb->data, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
		++pkt->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag,
					 /*eop=*/shinfo->nr_frags == 0, is_gso);
	}

	for (i = 0; i < shinfo->nr_frags; i++) {
		const skb_frag_t *frag = &shinfo->frags[i];
		bool is_eop = i == (shinfo->nr_frags - 1);
		u32 len = skb_frag_size(frag);
		dma_addr_t addr;

		addr = skb_frag_dma_map(tx->dev, frag, 0, len, DMA_TO_DEVICE);
		if (unlikely(dma_mapping_error(tx->dev, addr)))
			goto err;

		dma_unmap_len_set(pkt, len[pkt->num_bufs], len);
		dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr);
		++pkt->num_bufs;

		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, skb, len, addr,
					 completion_tag, is_eop, is_gso);
	}

	/* Commit the changes to our state */
	tx->dqo_tx.tail = desc_idx;

	/* Request a descriptor completion on the last descriptor of the
	 * packet if we are allowed to by the HW enforced interval.
	 */
	{
		u32 last_desc_idx = (desc_idx - 1) & tx->mask;
		u32 last_report_event_interval =
			(last_desc_idx - tx->dqo_tx.last_re_idx) & tx->mask;

		if (unlikely(last_report_event_interval >=
			     GVE_TX_MIN_RE_INTERVAL)) {
			tx->dqo.tx_ring[last_desc_idx].pkt.report_event = true;
			tx->dqo_tx.last_re_idx = last_desc_idx;
		}
	}

	return 0;

err:
	for (i = 0; i < pkt->num_bufs; i++) {
		if (i == 0) {
			dma_unmap_single(tx->dev,
					 dma_unmap_addr(pkt, dma[i]),
					 dma_unmap_len(pkt, len[i]),
					 DMA_TO_DEVICE);
		} else {
			dma_unmap_page(tx->dev,
				       dma_unmap_addr(pkt, dma[i]),
				       dma_unmap_len(pkt, len[i]),
				       DMA_TO_DEVICE);
		}
	}

	pkt->skb = NULL;
	pkt->num_bufs = 0;
	gve_free_pending_packet(tx, pkt);

	return -1;
}

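/* Number of buffer descriptors needed for one buffer of `size` bytes. */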
static int gve_num_descs_per_buf(size_t size)
{
	return DIV_ROUND_UP(size, GVE_TX_MAX_BUF_SIZE_DQO);
}

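/* Number of buffer descriptors needed for the linear portion plus all
 * fragments of `skb`.
 */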
static int gve_num_buffer_descs_needed(const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	int num_descs;
	int i;

	num_descs = gve_num_descs_per_buf(skb_headlen(skb));

	for (i = 0; i < shinfo->nr_frags; i++) {
		unsigned int frag_size = skb_frag_size(&shinfo->frags[i]);

		num_descs += gve_num_descs_per_buf(frag_size);
	}

	return num_descs;
}

/* Returns true if HW is capable of sending TSO represented by `skb`.
 *
 * Each segment must not span more than GVE_TX_MAX_DATA_DESCS buffers.
 * - The header is counted as one buffer for every single segment.
 * - A buffer which is split between two segments is counted for both.
 * - If a buffer contains both header and payload, it is counted as two buffers.
 */
static bool gve_can_send_tso(const struct sk_buff *skb)
{
	const int max_bufs_per_seg = GVE_TX_MAX_DATA_DESCS - 1;
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	const int header_len = skb_tcp_all_headers(skb);
	const int gso_size = shinfo->gso_size;
	int cur_seg_num_bufs;
	int cur_seg_size;
	int i;

	cur_seg_size = skb_headlen(skb) - header_len;
	cur_seg_num_bufs = cur_seg_size > 0;

	for (i = 0; i < shinfo->nr_frags; i++) {
		if (cur_seg_size >= gso_size) {
			cur_seg_size %= gso_size;
			cur_seg_num_bufs = cur_seg_size > 0;
		}

		if (unlikely(++cur_seg_num_bufs > max_bufs_per_seg))
			return false;

		cur_seg_size += skb_frag_size(&shinfo->frags[i]);
	}

	return true;
}

/* Attempt to transmit specified SKB.
 *
 * Returns 0 if the SKB was transmitted or dropped.
 * Returns -1 if there is not currently enough space to transmit the SKB.
 */
static int gve_try_tx_skb(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct sk_buff *skb)
{
	int num_buffer_descs;
	int total_num_descs;

	if (skb_is_gso(skb)) {
		/* If TSO doesn't meet HW requirements, attempt to linearize the
		 * packet.
		 */
		if (unlikely(!gve_can_send_tso(skb) &&
			     skb_linearize(skb) < 0)) {
			net_err_ratelimited("%s: Failed to transmit TSO packet\n",
					    priv->dev->name);
			goto drop;
		}

		num_buffer_descs = gve_num_buffer_descs_needed(skb);
	} else {
		num_buffer_descs = gve_num_buffer_descs_needed(skb);

		if (unlikely(num_buffer_descs > GVE_TX_MAX_DATA_DESCS)) {
			if (unlikely(skb_linearize(skb) < 0))
				goto drop;

			num_buffer_descs = 1;
		}
	}

	/* Metadata + (optional TSO) + data descriptors. */
	total_num_descs = 1 + skb_is_gso(skb) + num_buffer_descs;
	if (unlikely(gve_maybe_stop_tx_dqo(tx, total_num_descs +
					   GVE_TX_MIN_DESC_PREVENT_CACHE_OVERLAP))) {
		return -1;
	}

	if (unlikely(gve_tx_add_skb_no_copy_dqo(tx, skb) < 0))
		goto drop;

	netdev_tx_sent_queue(tx->netdev_txq, skb->len);
	skb_tx_timestamp(skb);
	return 0;

drop:
	tx->dropped_pkt++;
	dev_kfree_skb_any(skb);
	return 0;
}

/* Transmit a given skb and ring the doorbell. */
netdev_tx_t gve_tx_dqo(struct sk_buff *skb, struct net_device *dev)
{
	struct gve_priv *priv = netdev_priv(dev);
	struct gve_tx_ring *tx;

	tx = &priv->tx[skb_get_queue_mapping(skb)];
	if (unlikely(gve_try_tx_skb(priv, tx, skb) < 0)) {
		/* We need to ring the txq doorbell -- we have stopped the Tx
		 * queue for want of resources, but prior calls to gve_tx()
		 * may have added descriptors without ringing the doorbell.
		 */
		gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
		return NETDEV_TX_BUSY;
	}

	if (!netif_xmit_stopped(tx->netdev_txq) && netdev_xmit_more())
		return NETDEV_TX_OK;

	gve_tx_put_doorbell_dqo(priv, tx->q_resources, tx->dqo_tx.tail);
	return NETDEV_TX_OK;
}

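/* Append a pending packet to the tail of an index-based list (miss or
 * timed-out completions).
 */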
static void add_to_list(struct gve_tx_ring *tx, struct gve_index_list *list,
			struct gve_tx_pending_packet_dqo *pending_packet)
{
	s16 old_tail, index;

	index = pending_packet - tx->dqo.pending_packets;
	old_tail = list->tail;
	list->tail = index;
	if (old_tail == -1)
		list->head = index;
	else
		tx->dqo.pending_packets[old_tail].next = index;

	pending_packet->next = -1;
	pending_packet->prev = old_tail;
}

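/* Unlink a pending packet from an index-based list, fixing up head/tail as
 * needed.
 */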
static void remove_from_list(struct gve_tx_ring *tx,
			     struct gve_index_list *list,
			     struct gve_tx_pending_packet_dqo *pkt)
{
	s16 prev_index, next_index;

	prev_index = pkt->prev;
	next_index = pkt->next;

	if (prev_index == -1) {
		/* Node is head */
		list->head = next_index;
	} else {
		tx->dqo.pending_packets[prev_index].next = next_index;
	}
	if (next_index == -1) {
		/* Node is tail */
		list->tail = prev_index;
	} else {
		tx->dqo.pending_packets[next_index].prev = prev_index;
	}
}

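/* DMA-unmap every buffer of a pending packet: the linear portion first, then
 * any page fragments.
 */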
static void gve_unmap_packet(struct device *dev,
			     struct gve_tx_pending_packet_dqo *pkt)
{
	int i;

	/* SKB linear portion is guaranteed to be mapped */
	dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]),
			 dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE);
	for (i = 1; i < pkt->num_bufs; i++) {
		dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]),
			       dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE);
	}
	pkt->num_bufs = 0;
}

/* Completion types and expected behavior:
 * No Miss compl + Packet compl = Packet completed normally.
 * Miss compl + Re-inject compl = Packet completed normally.
 * No Miss compl + Re-inject compl = Skipped i.e. packet not completed.
 * Miss compl + Packet compl = Skipped i.e. packet not completed.
 */
static void gve_handle_packet_completion(struct gve_priv *priv,
					 struct gve_tx_ring *tx, bool is_napi,
					 u16 compl_tag, u64 *bytes, u64 *pkts,
					 bool is_reinjection)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];

	if (unlikely(is_reinjection)) {
		if (unlikely(pending_packet->state ==
			     GVE_PACKET_STATE_TIMED_OUT_COMPL)) {
			net_err_ratelimited("%s: Re-injection completion: %d received after timeout.\n",
					    priv->dev->name, (int)compl_tag);
			/* Packet was already completed as a result of timeout,
			 * so just remove from list and free pending packet.
			 */
			remove_from_list(tx,
					 &tx->dqo_compl.timed_out_completions,
					 pending_packet);
			gve_free_pending_packet(tx, pending_packet);
			return;
		}
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_REINJECT_COMPL)) {
			/* No outstanding miss completion but packet allocated
			 * implies packet receives a re-injection completion
			 * without a prior miss completion. Return without
			 * completing the packet.
			 */
			net_err_ratelimited("%s: Re-injection completion received without corresponding miss completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
	} else {
		/* Packet is allocated but not a pending data completion. */
		if (unlikely(pending_packet->state !=
			     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
			net_err_ratelimited("%s: No pending data completion: %d\n",
					    priv->dev->name, (int)compl_tag);
			return;
		}
	}
	gve_unmap_packet(tx->dev, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
	napi_consume_skb(pending_packet->skb, is_napi);
	pending_packet->skb = NULL;
	gve_free_pending_packet(tx, pending_packet);
}

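/* Handle a miss completion: account the packet toward the caller's byte and
 * packet counters, then move its pending packet to the miss-completions list
 * with a timeout, to await a re-injection completion.
 */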
static void gve_handle_miss_completion(struct gve_priv *priv,
				       struct gve_tx_ring *tx, u16 compl_tag,
				       u64 *bytes, u64 *pkts)
{
	struct gve_tx_pending_packet_dqo *pending_packet;

	if (unlikely(compl_tag >= tx->dqo.num_pending_packets)) {
		net_err_ratelimited("%s: Invalid TX completion tag: %d\n",
				    priv->dev->name, (int)compl_tag);
		return;
	}

	pending_packet = &tx->dqo.pending_packets[compl_tag];
	if (unlikely(pending_packet->state !=
		     GVE_PACKET_STATE_PENDING_DATA_COMPL)) {
		net_err_ratelimited("%s: Unexpected packet state: %d for completion tag: %d\n",
				    priv->dev->name, (int)pending_packet->state,
				    (int)compl_tag);
		return;
	}

	pending_packet->state = GVE_PACKET_STATE_PENDING_REINJECT_COMPL;
	/* jiffies can wrap around, but time comparisons can handle overflows. */
	pending_packet->timeout_jiffies =
			jiffies +
			msecs_to_jiffies(GVE_REINJECT_COMPL_TIMEOUT *
					 MSEC_PER_SEC);
	add_to_list(tx, &tx->dqo_compl.miss_completions, pending_packet);

	*bytes += pending_packet->skb->len;
	(*pkts)++;
}

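/* Walk the miss-completions list and handle entries whose re-injection
 * completion never arrived before the timeout: unmap and drop the skb, then
 * park the entry on the timed-out list so its completion tag stays reserved.
 */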
static void remove_miss_completions(struct gve_priv *priv,
				    struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.miss_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should time out in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.miss_completions,
				 pending_packet);
		/* Unmap the buffers and free the skb but do not free the
		 * pending packet itself, i.e. the completion tag stays
		 * reserved so that the driver can take appropriate action if
		 * a corresponding valid completion is received later.
		 */
		gve_unmap_packet(tx->dev, pending_packet);
		/* This indicates the packet was dropped. */
		dev_kfree_skb_any(pending_packet->skb);
		pending_packet->skb = NULL;
		tx->dropped_pkt++;
		net_err_ratelimited("%s: No reinjection completion was received for: %d.\n",
				    priv->dev->name,
				    (int)(pending_packet - tx->dqo.pending_packets));

		pending_packet->state = GVE_PACKET_STATE_TIMED_OUT_COMPL;
		pending_packet->timeout_jiffies =
				jiffies +
				msecs_to_jiffies(GVE_DEALLOCATE_COMPL_TIMEOUT *
						 MSEC_PER_SEC);
		/* Maintain the pending packet on another list so it can be
		 * freed at a later time.
		 */
		add_to_list(tx, &tx->dqo_compl.timed_out_completions,
			    pending_packet);
	}
}

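/* Free pending packets on the timed-out list once their deallocation timeout
 * has expired.
 */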
static void remove_timed_out_completions(struct gve_priv *priv,
					 struct gve_tx_ring *tx)
{
	struct gve_tx_pending_packet_dqo *pending_packet;
	s16 next_index;

	next_index = tx->dqo_compl.timed_out_completions.head;
	while (next_index != -1) {
		pending_packet = &tx->dqo.pending_packets[next_index];
		next_index = pending_packet->next;
		/* Break early because packets should time out in order. */
		if (time_is_after_jiffies(pending_packet->timeout_jiffies))
			break;

		remove_from_list(tx, &tx->dqo_compl.timed_out_completions,
				 pending_packet);
		gve_free_pending_packet(tx, pending_packet);
	}
}

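/* Process TX completion descriptors: advance the cached HW head, complete or
 * track packets according to the completion type, then update BQL and stats.
 * Returns the number of completion descriptors processed.
 */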
int gve_clean_tx_done_dqo(struct gve_priv *priv, struct gve_tx_ring *tx,
			  struct napi_struct *napi)
{
	u64 reinject_compl_bytes = 0;
	u64 reinject_compl_pkts = 0;
	int num_descs_cleaned = 0;
	u64 miss_compl_bytes = 0;
	u64 miss_compl_pkts = 0;
	u64 pkt_compl_bytes = 0;
	u64 pkt_compl_pkts = 0;

	/* Limit in order to avoid blocking for too long */
	while (!napi || pkt_compl_pkts < napi->weight) {
		struct gve_tx_compl_desc *compl_desc =
			&tx->dqo.compl_ring[tx->dqo_compl.head];
		u16 type;

		if (compl_desc->generation == tx->dqo_compl.cur_gen_bit)
			break;

		/* Prefetch the next descriptor. */
		prefetch(&tx->dqo.compl_ring[(tx->dqo_compl.head + 1) &
				tx->dqo.complq_mask]);

		/* Do not read data until we own the descriptor */
		dma_rmb();
		type = compl_desc->type;

		if (type == GVE_COMPL_TYPE_DQO_DESC) {
			/* This is the last descriptor fetched by HW plus one */
			u16 tx_head = le16_to_cpu(compl_desc->tx_head);

			atomic_set_release(&tx->dqo_compl.hw_tx_head, tx_head);
		} else if (type == GVE_COMPL_TYPE_DQO_PKT) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &pkt_compl_bytes,
						     &pkt_compl_pkts,
						     /*is_reinjection=*/false);
		} else if (type == GVE_COMPL_TYPE_DQO_MISS) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_miss_completion(priv, tx, compl_tag,
						   &miss_compl_bytes,
						   &miss_compl_pkts);
		} else if (type == GVE_COMPL_TYPE_DQO_REINJECTION) {
			u16 compl_tag = le16_to_cpu(compl_desc->completion_tag);

			gve_handle_packet_completion(priv, tx, !!napi,
						     compl_tag,
						     &reinject_compl_bytes,
						     &reinject_compl_pkts,
						     /*is_reinjection=*/true);
		}

		tx->dqo_compl.head =
			(tx->dqo_compl.head + 1) & tx->dqo.complq_mask;
		/* Flip the generation bit when we wrap around */
		tx->dqo_compl.cur_gen_bit ^= tx->dqo_compl.head == 0;
		num_descs_cleaned++;
	}

	netdev_tx_completed_queue(tx->netdev_txq,
				  pkt_compl_pkts + miss_compl_pkts,
				  pkt_compl_bytes + miss_compl_bytes);

	remove_miss_completions(priv, tx);
	remove_timed_out_completions(priv, tx);

	u64_stats_update_begin(&tx->statss);
	tx->bytes_done += pkt_compl_bytes + reinject_compl_bytes;
	tx->pkt_done += pkt_compl_pkts + reinject_compl_pkts;
	u64_stats_update_end(&tx->statss);
	return num_descs_cleaned;
}

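/* NAPI TX poll: optionally clean completed work, wake the queue if it was
 * stopped and progress was made, and report whether more work remains.
 */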
bool gve_tx_poll_dqo(struct gve_notify_block *block, bool do_clean)
{
	struct gve_tx_compl_desc *compl_desc;
	struct gve_tx_ring *tx = block->tx;
	struct gve_priv *priv = block->priv;

	if (do_clean) {
		int num_descs_cleaned = gve_clean_tx_done_dqo(priv, tx,
							      &block->napi);

		/* Sync with queue being stopped in `gve_maybe_stop_tx_dqo()` */
		mb();

		if (netif_tx_queue_stopped(tx->netdev_txq) &&
		    num_descs_cleaned > 0) {
			tx->wake_queue++;
			netif_tx_wake_queue(tx->netdev_txq);
		}
	}

	/* Return true if we still have work. */
	compl_desc = &tx->dqo.compl_ring[tx->dqo_compl.head];
	return compl_desc->generation != tx->dqo_compl.cur_gen_bit;
}