// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 32

static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);

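/* need_wakeup handling: zero-copy drivers call the helpers below when they
 * need user space to issue a syscall (poll()/sendmsg()/recvmsg()) before the
 * kernel can make further progress, e.g. when the fill ring has run dry. The
 * flag is written to the shared ring so user space can see it, and a cached
 * copy is kept in the pool to avoid dirtying the shared cache line when the
 * state has not changed.
 */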
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
					    u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].pool;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].pool;

	return NULL;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
			u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].pool = pool;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].pool = pool;

	return 0;
}

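/* Post a buffer that is already backed by the pool's umem on the socket's RX
 * ring. On success the descriptor is reserved (but not yet submitted, see
 * xsk_flush()) and the xskb reference is released; on failure the event is
 * accounted as rx_queue_full and the caller keeps ownership of the buffer.
 */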
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}

static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
	void *from_buf, *to_buf;
	u32 metalen;

	if (unlikely(xdp_data_meta_unsupported(from))) {
		from_buf = from->data;
		to_buf = to->data;
		metalen = 0;
	} else {
		from_buf = from->data_meta;
		metalen = from->data - from->data_meta;
		to_buf = to->data - metalen;
	}

	memcpy(to_buf, from_buf, len + metalen);
}

static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct xdp_buff *xsk_xdp;
	int err;
	u32 len;

	len = xdp->data_end - xdp->data;
	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->pool);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		xsk_buff_free(xsk_xdp);
		return err;
	}
	return 0;
}

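/* Report the TX ring as writeable only while it is at most half full. This
 * throttles EPOLLOUT/sk_write_space() notifications so user space backs off
 * before the ring is completely consumed.
 */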
static bool xsk_tx_writeable(struct xdp_sock *xs)
{
	if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
		return false;

	return true;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}

static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
	return 0;
}

static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}

int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv_check(xs, xdp);
	if (!err) {
		err = __xsk_rcv(xs, xdp);
		xsk_flush(xs);
	}
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;
	u32 len;

	err = xsk_rcv_check(xs, xdp);
	if (err)
		return err;

	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
		len = xdp->data_end - xdp->data;
		return __xsk_rcv_zc(xs, xdp, len);
	}

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xdp_return_buff(xdp);
	return err;
}

int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}

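/* Called at the end of the XDP receive path (xdp_do_flush()) to submit the RX
 * descriptors queued up by __xsk_map_redirect() on this CPU and to wake up
 * the corresponding sockets.
 */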
void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		__xskq_cons_release(xs->tx);
		if (xsk_tx_writeable(xs))
			xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

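/* Used by zero-copy drivers to fetch the next descriptor to transmit from any
 * socket sharing this pool. A completion queue slot is reserved up front so
 * that the descriptor can always be completed once the hardware is done with
 * it (see the backpressure comment below).
 */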
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			xs->tx->queue_empty_descs++;
			continue;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
					u32 max_entries)
{
	u32 nb_pkts = 0;

	while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
		nb_pkts++;

	xsk_tx_release(pool);
	return nb_pkts;
}

u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
				   u32 max_entries)
{
	struct xdp_sock *xs;
	u32 nb_pkts;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
	if (!nb_pkts)
		goto out;

	xskq_cons_release_n(xs->tx, nb_pkts);
	__xskq_cons_release(xs->tx);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

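/* Kick the driver so it starts processing the queue bound to this socket. */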
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}

static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}

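/* skb destructor for copy-mode TX: once the stack is done with the skb, the
 * descriptor address stashed in destructor_arg is published on the completion
 * queue under cq_lock, and the socket write memory is released.
 */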
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	xskq_prod_submit_addr(xs->pool->cq, addr);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

	sock_wfree(skb);
}

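/* For devices with IFF_TX_SKB_NO_LINEAR: build an skb whose page fragments
 * reference the umem pages directly, so only the headroom is allocated and
 * the payload is never copied.
 */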
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct sk_buff *skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
	if (unlikely(!skb))
		return ERR_PTR(err);

	skb_reserve(skb, hr);

	addr = desc->addr;
	len = desc->len;
	ts = pool->unaligned ? len : pool->chunk_size;

	buffer = xsk_buff_raw_get_data(pool, addr);
	offset = offset_in_page(buffer);
	addr = buffer - pool->addrs;

	for (copied = 0, i = 0; copied < len; i++) {
		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}

static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct net_device *dev = xs->dev;
	struct sk_buff *skb;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb))
			return skb;
	} else {
		u32 hr, tr, len;
		void *buffer;
		int err;

		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
		tr = dev->needed_tailroom;
		len = desc->len;

		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);
		skb_put(skb, len);

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			return ERR_PTR(err);
		}
	}

	skb->dev = dev;
	skb->priority = xs->sk.sk_priority;
	skb->mark = xs->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
	skb->destructor = xsk_destruct_skb;

	return skb;
}

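/* Copy-mode TX path, driven from sendmsg()/poll(). Converts up to
 * TX_BATCH_SIZE descriptors into skbs, reserving a completion queue slot per
 * packet before handing each skb to the device with __dev_direct_xmit().
 */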
static int xsk_generic_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	u32 max_batch = TX_BATCH_SIZE;
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	unsigned long flags;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		skb = xsk_build_skb(xs, &desc);
		if (IS_ERR(skb)) {
			err = PTR_ERR(skb);
			goto out;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		spin_lock_irqsave(&xs->pool->cq_lock, flags);
		if (xskq_prod_reserve(xs->pool->cq)) {
			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
			kfree_skb(skb);
			goto out;
		}
		spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

		err = __dev_direct_xmit(skb, xs->queue_id);
		if (err == NETDEV_TX_BUSY) {
			/* Tell user-space to retry the send */
			skb->destructor = sock_wfree;
			spin_lock_irqsave(&xs->pool->cq_lock, flags);
			xskq_prod_cancel(xs->pool->cq);
			spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
			/* Free skb without triggering the perf drop trace */
			consume_skb(skb);
			err = -EAGAIN;
			goto out;
		}

		xskq_cons_release(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

	xs->tx->queue_empty_descs++;

out:
	if (sent_frame)
		if (xsk_tx_writeable(xs))
			sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

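/* If the application prefers busy polling on this socket, skip the explicit
 * driver wakeup; the sk_busy_loop() call in the sendmsg/recvmsg path drives
 * the NAPI context instead.
 */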
static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
	       READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
	return false;
#endif
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	pool = xs->pool;
	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return __xsk_sendmsg(sk);
	return 0;
}

static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	bool need_wait = !(flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->rx))
		return -ENOBUFS;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
		return xsk_wakeup(xs, XDP_WAKEUP_RX);
	return 0;
}

static __poll_t xsk_poll(struct file *file, struct socket *sock,
			 struct poll_table_struct *wait)
{
	__poll_t mask = 0;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	sock_poll_wait(file, sock, wait);

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	pool = xs->pool;

	if (pool->cached_need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, pool->cached_need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && xsk_tx_writeable(xs))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xp_del_xsk(xs->pool, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock __rcu ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		bpf_map_inc(&node->map->map);
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock __rcu **map_entry = NULL;
	struct xsk_map *map;

	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		bpf_map_put(&map->map);
	}
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xskq_destroy(xs->fq_tmp);
	xskq_destroy(xs->cq_tmp);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

static bool xsk_validate_queues(struct xdp_sock *xs)
{
	return xs->fq_tmp && xs->cq_tmp;
}

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
						   dev, qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

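/* Original layout of the XDP_UMEM_REG argument. xsk_setsockopt() still
 * accepts this shorter structure from older user space and zero-fills the
 * remaining fields of the current struct xdp_umem_reg.
 */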
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};

static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_sockptr(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
			&xs->cq_tmp;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}

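/* Original layout of the XDP_STATISTICS result, returned when user space
 * passes a buffer that is too small for the current struct xdp_statistics.
 */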
struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(xs->fq_tmp);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(xs->cq_tmp);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references. */
				xp_clear_dev(xs->pool);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= xsk_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	if (!xp_put_pool(xs->pool))
		xdp_put_umem(xs->umem, !xs->pool);

	sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct xdp_sock *xs;
	struct sock *sk;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call = xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);