/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	LONG_MAX

#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)							\
	do {										\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
		this_cpu_inc(s->__stat);						\
	} while (0)

#define recycle_stat_add(pool, __stat, val)						\
	do {										\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
		this_cpu_add(s->__stat, val);						\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

bool page_pool_get_stats(struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
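
/* Example (illustrative sketch, not part of this file): a driver's
 * ethtool hooks can be wired to these helpers roughly as follows;
 * "priv", "rxq" and "num_rxq" are hypothetical driver fields, and
 * page_pool_get_stats() only adds to *stats, so zero it first:
 *
 *	// .get_sset_count
 *	return page_pool_ethtool_stats_get_count();
 *
 *	// .get_strings
 *	data = page_pool_ethtool_stats_get_strings(data);
 *
 *	// .get_ethtool_stats
 *	struct page_pool_stats pp_stats = {};
 *	int i;
 *
 *	for (i = 0; i < priv->num_rxq; i++)
 *		page_pool_get_stats(priv->rxq[i].page_pool, &pp_stats);
 *	data = page_pool_ethtool_stats_get(data, &pp_stats);
 */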

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to be used for DMA sending,
	 * which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
	    pool->p.flags & PP_FLAG_PAGE_FRAG)
		return -EINVAL;

#ifdef CONFIG_PAGE_POOL_STATS
	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
	if (!pool->recycle_stats)
		return -ENOMEM;
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		/* Don't leak the percpu stats on the error path */
		free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
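
/* Example (illustrative sketch, not part of this file): a hypothetical
 * driver creating one pool per RX queue; "rxq", "ring_size" and the
 * headroom choice are driver-specific assumptions:
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 *	rxq->page_pool = pp;
 */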

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fallthrough to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32bit cpu with 64bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;
	if (pool->p.init_callback)
		pool->p.init_callback(page, pool->p.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but count is zero
	 * and the page elements have not (yet) been DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just alloc'ed should/must have refcnt 1. */
	return page;
}

/* page_pool is meant to replace alloc_pages() API calls, but with a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
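
/* Example (illustrative sketch, not part of this file): refilling an RX
 * descriptor ring from NAPI/softirq context. page_pool_dev_alloc_pages()
 * is the header wrapper that passes atomic GFP flags; "rxq" and
 * "my_hw_post_rx_buffer()" are hypothetical driver names:
 *
 *	struct page *page = page_pool_dev_alloc_pages(rxq->page_pool);
 *	dma_addr_t dma;
 *
 *	if (unlikely(!page))
 *		break;
 *	dma = page_pool_get_dma_addr(page) + rxq->page_pool->p.offset;
 *	my_hw_post_rx_buffer(rxq, dma);
 */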

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))
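
/* Worked example: the counters wrap independently, e.g. hold_cnt == 2
 * (after wrapping) and release_cnt == U32_MAX - 1 still yields
 * _distance(2, U32_MAX - 1) == 4, i.e. four pages remain in-flight.
 */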

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

/* Disconnects a page (from a page_pool).  API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_release_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);

/* Return a page to the page allocator, cleaning up our state */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current context is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->p.max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once.  In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page().
	 */
	recycle_stat_inc(pool, released_refcnt);
	/* Do not replace this with page_pool_return_page() */
	page_pool_release_page(pool, page);
	put_page(page);

	return NULL;
}

void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
				  unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Ring full, fall back to freeing the page */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_defragged_page);
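
/* Example (illustrative sketch, not part of this file): recycling on
 * XDP_DROP from a driver's NAPI poll loop via the header wrapper
 * page_pool_put_full_page(); allow_direct = true is only safe from
 * softirq/NAPI context. "rxq" is a hypothetical driver field:
 *
 *	case XDP_DROP:
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *		break;
 */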

/* Caller must not use data area after call, as this function overwrites it */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_frag(pool, page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	page_pool_ring_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_ring_unlock(pool);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_defrag_page(page, drain_count)))
		return NULL;

	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_defrag_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
		    size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
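
/* Example (illustrative sketch, not part of this file): carving a
 * receive buffer out of a shared page; assumes the pool was created
 * with PP_FLAG_PAGE_FRAG and "buf_len" is a hypothetical buffer size:
 *
 *	unsigned int offset;
 *	struct page *page;
 *	void *va;
 *
 *	page = page_pool_alloc_frag(pool, &offset, buf_len, GFP_ATOMIC);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	va = page_address(page) + offset;
 */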

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	free_percpu(pool->recycle_stats);
#endif
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
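
/* Example (illustrative, names hypothetical): a driver tears the pool
 * down in its queue-close path, only after RX/NAPI is stopped so no
 * new pages are taken from the pool:
 *
 *	page_pool_destroy(rxq->page_pool);
 *	rxq->page_pool = NULL;
 *
 * If pages are still in-flight, the deferred release work above keeps
 * retrying (and warning) until they have all been returned.
 */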

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);

bool page_pool_return_skb_page(struct page *page)
{
	struct page_pool *pp;

	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of compound page and bit 1 for pfmemalloc page, so
	 * mask those bits for freeing side when doing below checking,
	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */
	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
		return false;

	pp = page->pp;

	/* The driver sets this to its memory recycling info. Reset it on recycle.
	 * This will *not* work for a NIC using a split-page memory model.
	 * The page will be returned to the pool here regardless of the
	 * 'flipped' fragment being in use or not.
	 */
	page_pool_put_full_page(pp, page, false);

	return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);