/******************************************************************************
 * page_alloc.c
 *
 * Simple buddy heap allocator for Xen.
 *
 * Copyright (c) 2002-2004 K A Fraser
 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * In general Xen maintains two pools of memory:
 *
 * - Xen heap: Memory which is always mapped (i.e. accessible by
 *             virtual address), via a permanent and contiguous
 *             "direct mapping". Macros like va() and pa() are valid
 *             for such memory and it is always permissible to stash
 *             pointers to Xen heap memory in data structures etc.
 *
 *             Xen heap pages are always anonymous (that is, not tied
 *             or accounted to any particular domain).
 *
 * - Dom heap: Memory which must be explicitly mapped, usually
 *             transiently with map_domain_page(), in order to be
 *             used. va() and pa() are not valid for such memory. When
 *             stashing pointers to dom heap pages, take care that the
 *             mappings used are permanent ones (e.g. vmap() or
 *             map_domain_page_global()); it is not safe to stash
 *             transient mappings such as those from map_domain_page().
 *
 *             Dom heap pages are often tied to a particular domain,
 *             but need not be (passing domain==NULL results in an
 *             anonymous dom heap allocation).
 *
 * The exact nature of this split is a (sub)arch decision which can
 * select one of three main variants:
 *
 * CONFIG_SEPARATE_XENHEAP=y
 *
 *   The xen heap is maintained as an entirely separate heap.
 *
 *   Arch code arranges for some (perhaps small) amount of physical
 *   memory to be covered by a direct mapping and registers that
 *   memory as the Xen heap (via init_xenheap_pages()) and the
 *   remainder as the dom heap.
 *
 *   This mode of operation is most commonly used by 32-bit arches
 *   where the virtual address space is insufficient to map all RAM.
 *
 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ALL RAM
 *
 *   All of RAM is covered by a permanent contiguous mapping and there
 *   is only a single heap.
 *
 *   Memory allocated from the Xen heap is flagged (in
 *   page_info.count_info) with PGC_xen_heap. Memory allocated from
 *   the Dom heap must still be explicitly mapped before use
 *   (e.g. with map_domain_page), in particular in common code.
 *
 *   xenheap_max_mfn() should not be called by arch code.
 *
 *   This mode of operation is most commonly used by 64-bit arches
 *   which have sufficient free virtual address space to permanently
 *   map the largest practical amount of RAM currently expected on
 *   that arch.
 *
 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ONLY PARTIAL RAM
 *
 *   There is a single heap, but only the beginning (up to some
 *   threshold) is covered by a permanent contiguous mapping.
 *
 *   Memory allocated from the Xen heap is allocated from below the
 *   threshold and flagged with PGC_xen_heap. Memory allocated from
 *   the dom heap is allocated from anywhere in the heap (although it
 *   will prefer to allocate from as high as possible to try and keep
 *   memory suitable for the Xen heap available).
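 *
 *   As an illustrative sketch (not lifted from any particular port;
 *   the range/limit names below are made up for the example), an arch
 *   using this variant typically drives the allocator at boot roughly
 *   as follows:
 *
 *       init_boot_pages(ram_start, ram_end);    // hand RAM to the
 *                                               //   boot allocator
 *       xenheap_max_mfn(direct_map_limit_mfn);  // declare the direct
 *                                               //   map limit
 *       end_boot_allocator();                   // migrate everything
 *                                               //   to the buddy heaps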
 *
 *   Arch code must call xenheap_max_mfn() to signal the limit of the
 *   direct mapping.
 *
 *   This mode of operation is most commonly used by 64-bit arches
 *   which have a restricted amount of virtual address space available
 *   for a direct map (due to e.g. reservations for other purposes)
 *   such that it is not possible to map all of RAM on systems with
 *   the largest practical amount of RAM currently expected on that
 *   arch.
 *
 * Boot Allocator
 *
 *   In addition to the two primary pools (xen heap and dom heap) a
 *   third "boot allocator" is used at start of day. This is a
 *   simplified allocator which can be used before the main heaps and
 *   their per-node metadata have been initialised.
 *
 *   Typically all memory which is destined to be dom heap memory
 *   (which is everything in the CONFIG_SEPARATE_XENHEAP=n
 *   configurations) is first registered with the boot allocator (via
 *   init_boot_pages()) and is then handed over to the main dom heap
 *   in end_boot_allocator().
 *
 * "Contiguous" mappings
 *
 *   Note that although the above talks about "contiguous" mappings
 *   some architectures implement a scheme ("PDX compression") to
 *   compress unused portions of the machine address space (i.e. large
 *   gaps between distinct banks of memory) in order to avoid creating
 *   enormous frame tables and direct maps which mostly map
 *   nothing. Thus a contiguous mapping may still have distinct
 *   regions within it.
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/spinlock.h>
#include <xen/mm.h>
#include <xen/param.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/keyhandler.h>
#include <xen/perfc.h>
#include <xen/pfn.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
#include <xen/event.h>
#include <public/sysctl.h>
#include <public/sched.h>
#include <asm/page.h>
#include <asm/numa.h>
#include <asm/flushtlb.h>
#ifdef CONFIG_X86
#include <asm/guest.h>
#include <asm/p2m.h>
#include <asm/setup.h> /* for highmem_start only */
#include <asm/paging.h>
#else
#define p2m_pod_offline_or_broken_hit(pg) 0
#define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
#endif

/*
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
 * e.g. 'badpage=0x3f45,0x8a321'.
 */
static char __initdata opt_badpage[100] = "";
string_param("badpage", opt_badpage);

/*
 * no-bootscrub -> Free pages are not zeroed during boot.
 */
enum bootscrub_mode {
    BOOTSCRUB_OFF,
    BOOTSCRUB_ON,
    BOOTSCRUB_IDLE,
};

/*
 * opt_bootscrub should live in the init section, since it's not accessed
 * afterwards. However at least LLVM assumes there are no side effects of
 * accessing the variable, and optimizes the condition in init_heap_pages() so
 * opt_bootscrub is read regardless of the value of system_state:
 * https://bugs.llvm.org/show_bug.cgi?id=39707
 */
static enum bootscrub_mode __read_mostly opt_bootscrub = BOOTSCRUB_IDLE;

static int __init parse_bootscrub_param(const char *s)
{
    /* Interpret 'bootscrub' alone in its positive boolean form */
    if ( *s == '\0' )
    {
        opt_bootscrub = BOOTSCRUB_ON;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_bootscrub = BOOTSCRUB_OFF;
        break;

    case 1:
        opt_bootscrub = BOOTSCRUB_ON;
        break;

    default:
        if ( !strcmp(s, "idle") )
            opt_bootscrub = BOOTSCRUB_IDLE;
        else
            return -EINVAL;
        break;
    }

    return 0;
}
custom_param("bootscrub", parse_bootscrub_param);

/*
 * bootscrub_chunk -> Number of bytes each CPU scrubs per step when boot-time
 * scrubbing runs in lockstep across the non-SMT CPUs of all NUMA nodes.
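 * e.g. 'bootscrub_chunk=256M' (an illustrative value; the option is parsed
 * by the size_param() registration below).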
*/ static unsigned long __initdata opt_bootscrub_chunk = MB(128); size_param("bootscrub_chunk", opt_bootscrub_chunk); /* scrub-domheap -> Domheap pages are scrubbed when freed */ static bool __read_mostly opt_scrub_domheap; boolean_param("scrub-domheap", opt_scrub_domheap); #ifdef CONFIG_SCRUB_DEBUG static bool __read_mostly scrub_debug; #else #define scrub_debug false #endif /* * Bit width of the DMA heap -- used to override NUMA-node-first. * allocation strategy, which can otherwise exhaust low memory. */ static unsigned int dma_bitsize; integer_param("dma_bits", dma_bitsize); /* Offlined page list, protected by heap_lock. */ PAGE_LIST_HEAD(page_offlined_list); /* Broken page list, protected by heap_lock. */ PAGE_LIST_HEAD(page_broken_list); /************************* * BOOT-TIME ALLOCATOR */ /* * first_valid_mfn is exported because it is use in ARM specific NUMA * helpers. See comment in asm-arm/numa.h. */ mfn_t first_valid_mfn = INVALID_MFN_INITIALIZER; struct bootmem_region { unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */ }; /* Statically allocate a page for bootmem_region_list. */ static struct bootmem_region __initdata bootmem_region_list[PAGE_SIZE / sizeof(struct bootmem_region)]; static unsigned int __initdata nr_bootmem_regions; struct scrub_region { unsigned long offset; unsigned long start; unsigned long per_cpu_sz; unsigned long rem; cpumask_t cpus; }; static struct scrub_region __initdata region[MAX_NUMNODES]; static unsigned long __initdata chunk_size; static void __init bootmem_region_add(unsigned long s, unsigned long e) { unsigned int i; if ( s >= e ) return; for ( i = 0; i < nr_bootmem_regions; i++ ) if ( s < bootmem_region_list[i].e ) break; BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s)); BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region))); memmove(&bootmem_region_list[i+1], &bootmem_region_list[i], (nr_bootmem_regions - i) * sizeof(*bootmem_region_list)); bootmem_region_list[i] = (struct bootmem_region) { s, e }; nr_bootmem_regions++; } static void __init bootmem_region_zap(unsigned long s, unsigned long e) { unsigned int i; for ( i = 0; i < nr_bootmem_regions; i++ ) { struct bootmem_region *r = &bootmem_region_list[i]; if ( e <= r->s ) break; if ( s >= r->e ) continue; if ( s <= r->s ) { r->s = min(e, r->e); } else if ( e >= r->e ) { r->e = s; } else { unsigned long _e = r->e; r->e = s; bootmem_region_add(e, _e); } } } void __init init_boot_pages(paddr_t ps, paddr_t pe) { unsigned long bad_spfn, bad_epfn; const char *p; #ifdef CONFIG_X86 const struct platform_bad_page *badpage; unsigned int i, array_size; BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) < MAX_ORDER + 1); #endif BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long)); ps = round_pgup(ps); pe = round_pgdown(pe); if ( pe <= ps ) return; first_valid_mfn = mfn_min(maddr_to_mfn(ps), first_valid_mfn); bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT); #ifdef CONFIG_X86 /* * Here we put platform-specific memory range workarounds, i.e. * memory known to be corrupt or otherwise in need to be reserved on * specific platforms. * We get these certain pages and remove them from memory region list. 
*/ badpage = get_platform_badpages(&array_size); if ( badpage ) { for ( i = 0; i < array_size; i++ ) { bootmem_region_zap(badpage->mfn, badpage->mfn + (1UL << badpage->order)); badpage++; } } if ( pv_shim ) { badpage = pv_shim_reserved_pages(&array_size); if ( badpage ) { for ( i = 0; i < array_size; i++ ) { bootmem_region_zap(badpage->mfn, badpage->mfn + (1UL << badpage->order)); badpage++; } } } #endif /* Check new pages against the bad-page list. */ p = opt_badpage; while ( *p != '\0' ) { bad_spfn = simple_strtoul(p, &p, 0); bad_epfn = bad_spfn; if ( *p == '-' ) { p++; bad_epfn = simple_strtoul(p, &p, 0); if ( bad_epfn < bad_spfn ) bad_epfn = bad_spfn; } if ( *p == ',' ) p++; else if ( *p != '\0' ) break; bootmem_region_zap(bad_spfn, bad_epfn+1); } } mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align) { unsigned long pg, _e; unsigned int i = nr_bootmem_regions; BUG_ON(!nr_bootmem_regions); while ( i-- ) { struct bootmem_region *r = &bootmem_region_list[i]; pg = (r->e - nr_pfns) & ~(pfn_align - 1); if ( pg >= r->e || pg < r->s ) continue; #if defined(CONFIG_X86) && !defined(NDEBUG) /* * Filtering pfn_align == 1 since the only allocations using a bigger * alignment are the ones used for setting up the frame table chunks. * Those allocations get remapped anyway, i.e. them not having 1:1 * mappings always accessible is not a problem. */ if ( highmem_start && pfn_align == 1 && r->e > PFN_DOWN(highmem_start) ) { pg = r->s; if ( pg + nr_pfns > PFN_DOWN(highmem_start) ) continue; r->s = pg + nr_pfns; return _mfn(pg); } #endif _e = r->e; r->e = pg; bootmem_region_add(pg + nr_pfns, _e); return _mfn(pg); } BUG(); } /************************* * BINARY BUDDY ALLOCATOR */ #define MEMZONE_XEN 0 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT + 1) #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT)) #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \ (flsl(mfn_x(page_to_mfn(pg))) ? : 1)) typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1]; static heap_by_zone_and_order_t *_heap[MAX_NUMNODES]; #define heap(node, zone, order) ((*_heap[node])[zone][order]) static unsigned long node_need_scrub[MAX_NUMNODES]; static unsigned long *avail[MAX_NUMNODES]; static long total_avail_pages; static DEFINE_SPINLOCK(heap_lock); static long outstanding_claims; /* total outstanding claims by all domains */ unsigned long domain_adjust_tot_pages(struct domain *d, long pages) { long dom_before, dom_after, dom_claimed, sys_before, sys_after; ASSERT(spin_is_locked(&d->page_alloc_lock)); d->tot_pages += pages; /* * can test d->claimed_pages race-free because it can only change * if d->page_alloc_lock and heap_lock are both held, see also * domain_set_outstanding_pages below */ if ( !d->outstanding_pages ) goto out; spin_lock(&heap_lock); /* adjust domain outstanding pages; may not go negative */ dom_before = d->outstanding_pages; dom_after = dom_before - pages; BUG_ON(dom_before < 0); dom_claimed = dom_after < 0 ? 
0 : dom_after; d->outstanding_pages = dom_claimed; /* flag accounting bug if system outstanding_claims would go negative */ sys_before = outstanding_claims; sys_after = sys_before - (dom_before - dom_claimed); BUG_ON(sys_after < 0); outstanding_claims = sys_after; spin_unlock(&heap_lock); out: return d->tot_pages; } int domain_set_outstanding_pages(struct domain *d, unsigned long pages) { int ret = -ENOMEM; unsigned long claim, avail_pages; /* * take the domain's page_alloc_lock, else all d->tot_page adjustments * must always take the global heap_lock rather than only in the much * rarer case that d->outstanding_pages is non-zero */ spin_lock(&d->page_alloc_lock); spin_lock(&heap_lock); /* pages==0 means "unset" the claim. */ if ( pages == 0 ) { outstanding_claims -= d->outstanding_pages; d->outstanding_pages = 0; ret = 0; goto out; } /* only one active claim per domain please */ if ( d->outstanding_pages ) { ret = -EINVAL; goto out; } /* disallow a claim not exceeding domain_tot_pages() or above max_pages */ if ( (pages <= domain_tot_pages(d)) || (pages > d->max_pages) ) { ret = -EINVAL; goto out; } /* how much memory is available? */ avail_pages = total_avail_pages; avail_pages -= outstanding_claims; /* * Note, if domain has already allocated memory before making a claim * then the claim must take domain_tot_pages() into account */ claim = pages - domain_tot_pages(d); if ( claim > avail_pages ) goto out; /* yay, claim fits in available memory, stake the claim, success! */ d->outstanding_pages = claim; outstanding_claims += d->outstanding_pages; ret = 0; out: spin_unlock(&heap_lock); spin_unlock(&d->page_alloc_lock); return ret; } void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages) { spin_lock(&heap_lock); *outstanding_pages = outstanding_claims; *free_pages = avail_domheap_pages(); spin_unlock(&heap_lock); } static bool __read_mostly first_node_initialised; #ifndef CONFIG_SEPARATE_XENHEAP static unsigned int __read_mostly xenheap_bits; #else #define xenheap_bits 0 #endif static unsigned long init_node_heap(int node, unsigned long mfn, unsigned long nr, bool *use_tail) { /* First node to be discovered has its heap metadata statically alloced. 
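 * The static buffers below exist because the very first node cannot take
 * its bookkeeping from a heap which has not been set up yet; subsequent
 * nodes instead carve their metadata out of the head or tail of the range
 * being added, or failing that allocate it from the already-initialised
 * Xen heap.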
*/ static heap_by_zone_and_order_t _heap_static; static unsigned long avail_static[NR_ZONES]; unsigned long needed = (sizeof(**_heap) + sizeof(**avail) * NR_ZONES + PAGE_SIZE - 1) >> PAGE_SHIFT; int i, j; if ( !first_node_initialised ) { _heap[node] = &_heap_static; avail[node] = avail_static; first_node_initialised = true; needed = 0; } else if ( *use_tail && nr >= needed && arch_mfn_in_directmap(mfn + nr) && (!xenheap_bits || !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) ) { _heap[node] = mfn_to_virt(mfn + nr - needed); avail[node] = mfn_to_virt(mfn + nr - 1) + PAGE_SIZE - sizeof(**avail) * NR_ZONES; } else if ( nr >= needed && arch_mfn_in_directmap(mfn + needed) && (!xenheap_bits || !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) ) { _heap[node] = mfn_to_virt(mfn); avail[node] = mfn_to_virt(mfn + needed - 1) + PAGE_SIZE - sizeof(**avail) * NR_ZONES; *use_tail = false; } else if ( get_order_from_bytes(sizeof(**_heap)) == get_order_from_pages(needed) ) { _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0); BUG_ON(!_heap[node]); avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) - sizeof(**avail) * NR_ZONES; needed = 0; } else { _heap[node] = xmalloc(heap_by_zone_and_order_t); avail[node] = xmalloc_array(unsigned long, NR_ZONES); BUG_ON(!_heap[node] || !avail[node]); needed = 0; } memset(avail[node], 0, NR_ZONES * sizeof(long)); for ( i = 0; i < NR_ZONES; i++ ) for ( j = 0; j <= MAX_ORDER; j++ ) INIT_PAGE_LIST_HEAD(&heap(node, i, j)); return needed; } /* Default to 64 MiB */ #define DEFAULT_LOW_MEM_VIRQ (((paddr_t) 64) << 20) #define MAX_LOW_MEM_VIRQ (((paddr_t) 1024) << 20) static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1); size_param("low_mem_virq_limit", opt_low_mem_virq); /* Thresholds to control hysteresis. In pages */ /* When memory grows above this threshold, reset hysteresis. * -1 initially to not reset until at least one virq issued. */ static unsigned long low_mem_virq_high = -1UL; /* Threshold at which we issue virq */ static unsigned long low_mem_virq_th = 0; /* Original threshold after all checks completed */ static unsigned long low_mem_virq_orig = 0; /* Order for current threshold */ static unsigned int low_mem_virq_th_order = 0; /* Perform bootstrapping checks and set bounds */ static void __init setup_low_mem_virq(void) { unsigned int order; paddr_t threshold; bool halve; /* If the user specifies zero, then he/she doesn't want this virq * to ever trigger. */ if ( opt_low_mem_virq == 0 ) { low_mem_virq_th = -1UL; return; } /* If the user did not specify a knob, remember that */ halve = (opt_low_mem_virq == ((paddr_t) -1)); threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq; /* Dom0 has already been allocated by now. So check we won't be * complaining immediately with whatever's left of the heap. */ threshold = min(threshold, ((paddr_t) total_avail_pages) << PAGE_SHIFT); /* Then, cap to some predefined maximum */ threshold = min(threshold, MAX_LOW_MEM_VIRQ); /* If the user specified no knob, and we are at the current available * level, halve the threshold. */ if ( halve && (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) ) threshold >>= 1; /* Zero? 
Have to fire immediately */ threshold = max(threshold, (paddr_t) PAGE_SIZE); /* Threshold bytes -> pages */ low_mem_virq_th = threshold >> PAGE_SHIFT; /* Next, round the threshold down to the next order */ order = get_order_from_pages(low_mem_virq_th); if ( (1UL << order) > low_mem_virq_th ) order--; /* Set bounds, ready to go */ low_mem_virq_th = low_mem_virq_orig = 1UL << order; low_mem_virq_th_order = order; printk("Initial low memory virq threshold set at %#lx pages.\n", low_mem_virq_th); } static void check_low_mem_virq(void) { unsigned long avail_pages = total_avail_pages - outstanding_claims; if ( unlikely(avail_pages <= low_mem_virq_th) ) { send_global_virq(VIRQ_ENOMEM); /* Update thresholds. Next warning will be when we drop below * next order. However, we wait until we grow beyond one * order above us to complain again at the current order */ low_mem_virq_high = 1UL << (low_mem_virq_th_order + 1); if ( low_mem_virq_th_order > 0 ) low_mem_virq_th_order--; low_mem_virq_th = 1UL << low_mem_virq_th_order; return; } if ( unlikely(avail_pages >= low_mem_virq_high) ) { /* Reset hysteresis. Bring threshold up one order. * If we are back where originally set, set high * threshold to -1 to avoid further growth of * virq threshold. */ low_mem_virq_th_order++; low_mem_virq_th = 1UL << low_mem_virq_th_order; if ( low_mem_virq_th == low_mem_virq_orig ) low_mem_virq_high = -1UL; else low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2); } } /* Pages that need a scrub are added to tail, otherwise to head. */ static void page_list_add_scrub(struct page_info *pg, unsigned int node, unsigned int zone, unsigned int order, unsigned int first_dirty) { PFN_ORDER(pg) = order; pg->u.free.first_dirty = first_dirty; pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING; if ( first_dirty != INVALID_DIRTY_IDX ) { ASSERT(first_dirty < (1U << order)); page_list_add_tail(pg, &heap(node, zone, order)); } else page_list_add(pg, &heap(node, zone, order)); } /* SCRUB_PATTERN needs to be a repeating series of bytes. */ #ifndef NDEBUG #define SCRUB_PATTERN 0xc2c2c2c2c2c2c2c2ULL #else #define SCRUB_PATTERN 0ULL #endif #define SCRUB_BYTE_PATTERN (SCRUB_PATTERN & 0xff) static void poison_one_page(struct page_info *pg) { #ifdef CONFIG_SCRUB_DEBUG uint64_t *ptr; if ( !scrub_debug ) return; ptr = __map_domain_page(pg); *ptr = ~SCRUB_PATTERN; unmap_domain_page(ptr); #endif } static void check_one_page(struct page_info *pg) { #ifdef CONFIG_SCRUB_DEBUG const uint64_t *ptr; unsigned int i; if ( !scrub_debug ) return; ptr = __map_domain_page(pg); for ( i = 0; i < PAGE_SIZE / sizeof (*ptr); i++ ) BUG_ON(ptr[i] != SCRUB_PATTERN); unmap_domain_page(ptr); #endif } static void check_and_stop_scrub(struct page_info *head) { if ( head->u.free.scrub_state == BUDDY_SCRUBBING ) { typeof(head->u.free) pgfree; head->u.free.scrub_state = BUDDY_SCRUB_ABORT; spin_lock_kick(); for ( ; ; ) { /* Can't ACCESS_ONCE() a bitfield. */ pgfree.val = ACCESS_ONCE(head->u.free.val); if ( pgfree.scrub_state != BUDDY_SCRUB_ABORT ) break; cpu_relax(); } } } static struct page_info *get_free_buddy(unsigned int zone_lo, unsigned int zone_hi, unsigned int order, unsigned int memflags, const struct domain *d) { nodeid_t first, node = MEMF_get_node(memflags), req_node = node; nodemask_t nodemask = node_online_map; unsigned int j, zone, nodemask_retry = 0; struct page_info *pg; bool use_unscrubbed = (memflags & MEMF_no_scrub); /* * d->node_affinity is our preferred allocation set if provided, but it * may have bits set outside of node_online_map. Clamp it. 
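 *
 * Overall search order implemented below: start from the requested node
 * (or, if none was requested, cycle from the domain's last_alloc_node or
 * fall back to the current CPU's node), scan zones from zone_hi down to
 * zone_lo, then move on to the other nodes in the (clamped) nodemask,
 * and finally, unless MEMF_exact_node was requested, retry once with the
 * remaining online nodes.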
*/ if ( d ) { /* * It is the callers responsibility to ensure that d->node_affinity * isn't complete junk. */ if ( nodes_intersects(nodemask, d->node_affinity) ) nodes_and(nodemask, nodemask, d->node_affinity); else ASSERT_UNREACHABLE(); } if ( node == NUMA_NO_NODE ) { if ( d != NULL ) node = cycle_node(d->last_alloc_node, nodemask); if ( node >= MAX_NUMNODES ) node = cpu_to_node(smp_processor_id()); } else if ( unlikely(node >= MAX_NUMNODES) ) { ASSERT_UNREACHABLE(); return NULL; } first = node; /* * Start with requested node, but exhaust all node memory in requested * zone before failing, only calc new node value if we fail to find memory * in target node, this avoids needless computation on fast-path. */ for ( ; ; ) { zone = zone_hi; do { /* Check if target node can support the allocation. */ if ( !avail[node] || (avail[node][zone] < (1UL << order)) ) continue; /* Find smallest order which can satisfy the request. */ for ( j = order; j <= MAX_ORDER; j++ ) { if ( (pg = page_list_remove_head(&heap(node, zone, j))) ) { if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX ) return pg; /* * We grab single pages (order=0) even if they are * unscrubbed. Given that scrubbing one page is fairly quick * it is not worth breaking higher orders. */ if ( (order == 0) || use_unscrubbed ) { check_and_stop_scrub(pg); return pg; } page_list_add_tail(pg, &heap(node, zone, j)); } } } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */ if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE ) return NULL; /* Pick next node. */ if ( !nodemask_test(node, &nodemask) ) { /* Very first node may be caller-specified and outside nodemask. */ ASSERT(!nodemask_retry); first = node = first_node(nodemask); if ( node < MAX_NUMNODES ) continue; } else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES ) node = first_node(nodemask); if ( node == first ) { /* When we have tried all in nodemask, we fall back to others. */ if ( (memflags & MEMF_exact_node) || nodemask_retry++ ) return NULL; nodes_andnot(nodemask, node_online_map, nodemask); first = node = first_node(nodemask); if ( node >= MAX_NUMNODES ) return NULL; } } } /* Allocate 2^@order contiguous pages. */ static struct page_info *alloc_heap_pages( unsigned int zone_lo, unsigned int zone_hi, unsigned int order, unsigned int memflags, struct domain *d) { nodeid_t node; unsigned int i, buddy_order, zone, first_dirty; unsigned long request = 1UL << order; struct page_info *pg; bool need_tlbflush = false; uint32_t tlbflush_timestamp = 0; unsigned int dirty_cnt = 0; /* Make sure there are enough bits in memflags for nodeID. */ BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t))); ASSERT(zone_lo <= zone_hi); ASSERT(zone_hi < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) return NULL; spin_lock(&heap_lock); /* * Claimed memory is considered unavailable unless the request * is made by a domain with sufficient unclaimed pages. */ if ( (outstanding_claims + request > total_avail_pages) && ((memflags & MEMF_no_refcount) || !d || d->outstanding_pages < request) ) { spin_unlock(&heap_lock); return NULL; } pg = get_free_buddy(zone_lo, zone_hi, order, memflags, d); /* Try getting a dirty buddy if we couldn't get a clean one. */ if ( !pg && !(memflags & MEMF_no_scrub) ) pg = get_free_buddy(zone_lo, zone_hi, order, memflags | MEMF_no_scrub, d); if ( !pg ) { /* No suitable memory blocks. Fail the request. 
*/ spin_unlock(&heap_lock); return NULL; } node = phys_to_nid(page_to_maddr(pg)); zone = page_to_zone(pg); buddy_order = PFN_ORDER(pg); first_dirty = pg->u.free.first_dirty; /* We may have to halve the chunk a number of times. */ while ( buddy_order != order ) { buddy_order--; page_list_add_scrub(pg, node, zone, buddy_order, (1U << buddy_order) > first_dirty ? first_dirty : INVALID_DIRTY_IDX); pg += 1U << buddy_order; if ( first_dirty != INVALID_DIRTY_IDX ) { /* Adjust first_dirty */ if ( first_dirty >= 1U << buddy_order ) first_dirty -= 1U << buddy_order; else first_dirty = 0; /* We've moved past original first_dirty */ } } ASSERT(avail[node][zone] >= request); avail[node][zone] -= request; total_avail_pages -= request; ASSERT(total_avail_pages >= 0); check_low_mem_virq(); if ( d != NULL ) d->last_alloc_node = node; for ( i = 0; i < (1 << order); i++ ) { /* Reference count must continuously be zero for free pages. */ if ( (pg[i].count_info & ~PGC_need_scrub) != PGC_state_free ) { printk(XENLOG_ERR "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n", i, mfn_x(page_to_mfn(pg + i)), pg[i].count_info, pg[i].v.free.order, pg[i].u.free.val, pg[i].tlbflush_timestamp); BUG(); } /* PGC_need_scrub can only be set if first_dirty is valid */ ASSERT(first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub)); /* Preserve PGC_need_scrub so we can check it after lock is dropped. */ pg[i].count_info = PGC_state_inuse | (pg[i].count_info & PGC_need_scrub); if ( !(memflags & MEMF_no_tlbflush) ) accumulate_tlbflush(&need_tlbflush, &pg[i], &tlbflush_timestamp); /* Initialise fields which have other uses for free pages. */ pg[i].u.inuse.type_info = 0; page_set_owner(&pg[i], NULL); /* Ensure cache and RAM are consistent for platforms where the * guest can control its own visibility of/through the cache. */ flush_page_to_ram(mfn_x(page_to_mfn(&pg[i])), !(memflags & MEMF_no_icache_flush)); } spin_unlock(&heap_lock); if ( first_dirty != INVALID_DIRTY_IDX || (scrub_debug && !(memflags & MEMF_no_scrub)) ) { for ( i = 0; i < (1U << order); i++ ) { if ( test_bit(_PGC_need_scrub, &pg[i].count_info) ) { if ( !(memflags & MEMF_no_scrub) ) scrub_one_page(&pg[i]); dirty_cnt++; spin_lock(&heap_lock); pg[i].count_info &= ~PGC_need_scrub; spin_unlock(&heap_lock); } else if ( !(memflags & MEMF_no_scrub) ) check_one_page(&pg[i]); } if ( dirty_cnt ) { spin_lock(&heap_lock); node_need_scrub[node] -= dirty_cnt; spin_unlock(&heap_lock); } } if ( need_tlbflush ) filtered_flush_tlb_mask(tlbflush_timestamp); return pg; } /* Remove any offlined page in the buddy pointed to by head. */ static int reserve_offlined_page(struct page_info *head) { unsigned int node = phys_to_nid(page_to_maddr(head)); int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0; struct page_info *cur_head; unsigned int cur_order, first_dirty; ASSERT(spin_is_locked(&heap_lock)); cur_head = head; check_and_stop_scrub(head); /* * We may break the buddy so let's mark the head as clean. Then, when * merging chunks back into the heap, we will see whether the chunk has * unscrubbed pages and set its first_dirty properly. 
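 *
 * The loop below then walks the buddy: offlined pages are skipped, the
 * largest possible power-of-two runs of usable pages are returned to the
 * free lists, and each offlined page is finally moved to
 * page_offlined_list (or to page_broken_list if it is also broken).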
*/ first_dirty = head->u.free.first_dirty; head->u.free.first_dirty = INVALID_DIRTY_IDX; page_list_del(head, &heap(node, zone, head_order)); while ( cur_head < (head + (1 << head_order)) ) { struct page_info *pg; int next_order; if ( page_state_is(cur_head, offlined) ) { cur_head++; if ( first_dirty != INVALID_DIRTY_IDX && first_dirty ) first_dirty--; continue; } next_order = cur_order = 0; while ( cur_order < head_order ) { next_order = cur_order + 1; if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) ) goto merge; for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order ); i < (1 << next_order); i++, pg++ ) if ( page_state_is(pg, offlined) ) break; if ( i == ( 1 << next_order) ) { cur_order = next_order; continue; } else { merge: /* We don't consider merging outside the head_order. */ page_list_add_scrub(cur_head, node, zone, cur_order, (1U << cur_order) > first_dirty ? first_dirty : INVALID_DIRTY_IDX); cur_head += (1 << cur_order); /* Adjust first_dirty if needed. */ if ( first_dirty != INVALID_DIRTY_IDX ) { if ( first_dirty >= 1U << cur_order ) first_dirty -= 1U << cur_order; else first_dirty = 0; } break; } } } for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ ) { if ( !page_state_is(cur_head, offlined) ) continue; avail[node][zone]--; total_avail_pages--; ASSERT(total_avail_pages >= 0); page_list_add_tail(cur_head, test_bit(_PGC_broken, &cur_head->count_info) ? &page_broken_list : &page_offlined_list); count++; } return count; } static nodemask_t node_scrubbing; /* * If get_node is true this will return closest node that needs to be scrubbed, * with appropriate bit in node_scrubbing set. * If get_node is not set, this will return *a* node that needs to be scrubbed. * node_scrubbing bitmask will no be updated. * If no node needs scrubbing then NUMA_NO_NODE is returned. */ static unsigned int node_to_scrub(bool get_node) { nodeid_t node = cpu_to_node(smp_processor_id()), local_node; nodeid_t closest = NUMA_NO_NODE; u8 dist, shortest = 0xff; if ( node == NUMA_NO_NODE ) node = 0; if ( node_need_scrub[node] && (!get_node || !node_test_and_set(node, node_scrubbing)) ) return node; /* * See if there are memory-only nodes that need scrubbing and choose * the closest one. */ local_node = node; for ( ; ; ) { do { node = cycle_node(node, node_online_map); } while ( !cpumask_empty(&node_to_cpumask(node)) && (node != local_node) ); if ( node == local_node ) break; if ( node_need_scrub[node] ) { if ( !get_node ) return node; dist = __node_distance(local_node, node); /* * Grab the node right away. If we find a closer node later we will * release this one. While there is a chance that another CPU will * not be able to scrub that node when it is searching for scrub work * at the same time it will be able to do so next time it wakes up. * The alternative would be to perform this search under a lock but * then we'd need to take this lock every time we come in here. */ if ( (dist < shortest || closest == NUMA_NO_NODE) && !node_test_and_set(node, node_scrubbing) ) { if ( closest != NUMA_NO_NODE ) node_clear(closest, node_scrubbing); shortest = dist; closest = node; } } } return closest; } struct scrub_wait_state { struct page_info *pg; unsigned int first_dirty; bool drop; }; static void scrub_continue(void *data) { struct scrub_wait_state *st = data; if ( st->drop ) return; if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT ) { /* There is a waiter for this buddy. Release it. 
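 *
 * (This is the other half of the handshake with check_and_stop_scrub():
 * the allocator sets BUDDY_SCRUB_ABORT and spins; this callback, run
 * while scrub_free_pages() is re-acquiring the heap lock, records how far
 * scrubbing got in first_dirty, clears the scrub state and lets the
 * waiter take the buddy.)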
*/ st->drop = true; st->pg->u.free.first_dirty = st->first_dirty; smp_wmb(); st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING; } } bool scrub_free_pages(void) { struct page_info *pg; unsigned int zone; unsigned int cpu = smp_processor_id(); bool preempt = false; nodeid_t node; unsigned int cnt = 0; node = node_to_scrub(true); if ( node == NUMA_NO_NODE ) return false; spin_lock(&heap_lock); for ( zone = 0; zone < NR_ZONES; zone++ ) { unsigned int order = MAX_ORDER; do { while ( !page_list_empty(&heap(node, zone, order)) ) { unsigned int i, dirty_cnt; struct scrub_wait_state st; /* Unscrubbed pages are always at the end of the list. */ pg = page_list_last(&heap(node, zone, order)); if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX ) break; ASSERT(pg->u.free.scrub_state == BUDDY_NOT_SCRUBBING); pg->u.free.scrub_state = BUDDY_SCRUBBING; spin_unlock(&heap_lock); dirty_cnt = 0; for ( i = pg->u.free.first_dirty; i < (1U << order); i++) { if ( test_bit(_PGC_need_scrub, &pg[i].count_info) ) { scrub_one_page(&pg[i]); /* * We can modify count_info without holding heap * lock since we effectively locked this buddy by * setting its scrub_state. */ pg[i].count_info &= ~PGC_need_scrub; dirty_cnt++; cnt += 100; /* scrubbed pages add heavier weight. */ } else cnt++; if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT ) { /* Someone wants this chunk. Drop everything. */ pg->u.free.first_dirty = (i == (1U << order) - 1) ? INVALID_DIRTY_IDX : i + 1; smp_wmb(); pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING; spin_lock(&heap_lock); node_need_scrub[node] -= dirty_cnt; spin_unlock(&heap_lock); goto out_nolock; } /* * Scrub a few (8) pages before becoming eligible for * preemption. But also count non-scrubbing loop iterations * so that we don't get stuck here with an almost clean * heap. */ if ( cnt > 800 && softirq_pending(cpu) ) { preempt = true; break; } } st.pg = pg; /* * get_free_buddy() grabs a buddy with first_dirty set to * INVALID_DIRTY_IDX so we can't set pg's first_dirty here. * It will be set either below or in the lock callback (in * scrub_continue()). */ st.first_dirty = (i >= (1U << order) - 1) ? INVALID_DIRTY_IDX : i + 1; st.drop = false; spin_lock_cb(&heap_lock, scrub_continue, &st); node_need_scrub[node] -= dirty_cnt; if ( st.drop ) goto out; if ( i >= (1U << order) - 1 ) { page_list_del(pg, &heap(node, zone, order)); page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX); } else pg->u.free.first_dirty = i + 1; pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING; if ( preempt || (node_need_scrub[node] == 0) ) goto out; } } while ( order-- != 0 ); } out: spin_unlock(&heap_lock); out_nolock: node_clear(node, node_scrubbing); return node_to_scrub(false) != NUMA_NO_NODE; } /* Free 2^@order set of pages. */ static void free_heap_pages( struct page_info *pg, unsigned int order, bool need_scrub) { unsigned long mask; mfn_t mfn = page_to_mfn(pg); unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0; unsigned int zone = page_to_zone(pg); ASSERT(order <= MAX_ORDER); ASSERT(node >= 0); spin_lock(&heap_lock); for ( i = 0; i < (1 << order); i++ ) { /* * Cannot assume that count_info == 0, as there are some corner cases * where it isn't the case and yet it isn't a bug: * 1. page_get_owner() is NULL * 2. page_get_owner() is a domain that was never accessible by * its domid (e.g., failed to fully construct the domain). * 3. page was never addressable by the guest (e.g., it's an * auto-translate-physmap guest and the page was never included * in its pseudophysical address space). 
* In all the above cases there can be no guest mappings of this page. */ switch ( pg[i].count_info & PGC_state ) { case PGC_state_inuse: BUG_ON(pg[i].count_info & PGC_broken); pg[i].count_info = PGC_state_free; break; case PGC_state_offlining: pg[i].count_info = (pg[i].count_info & PGC_broken) | PGC_state_offlined; tainted = 1; break; default: printk(XENLOG_ERR "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n", i, mfn_x(page_to_mfn(pg + i)), pg[i].count_info, pg[i].v.free.order, pg[i].u.free.val, pg[i].tlbflush_timestamp); BUG(); } /* If a page has no owner it will need no safety TLB flush. */ pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL); if ( pg[i].u.free.need_tlbflush ) page_set_tlbflush_timestamp(&pg[i]); /* This page is not a guest frame any more. */ page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */ set_gpfn_from_mfn(mfn_x(mfn) + i, INVALID_M2P_ENTRY); if ( need_scrub ) { pg[i].count_info |= PGC_need_scrub; poison_one_page(&pg[i]); } } avail[node][zone] += 1 << order; total_avail_pages += 1 << order; if ( need_scrub ) { node_need_scrub[node] += 1 << order; pg->u.free.first_dirty = 0; } else pg->u.free.first_dirty = INVALID_DIRTY_IDX; /* Merge chunks as far as possible. */ while ( order < MAX_ORDER ) { mask = 1UL << order; if ( (mfn_x(page_to_mfn(pg)) & mask) ) { struct page_info *predecessor = pg - mask; /* Merge with predecessor block? */ if ( !mfn_valid(page_to_mfn(predecessor)) || !page_state_is(predecessor, free) || (PFN_ORDER(predecessor) != order) || (phys_to_nid(page_to_maddr(predecessor)) != node) ) break; check_and_stop_scrub(predecessor); page_list_del(predecessor, &heap(node, zone, order)); /* Update predecessor's first_dirty if necessary. */ if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX && pg->u.free.first_dirty != INVALID_DIRTY_IDX ) predecessor->u.free.first_dirty = (1U << order) + pg->u.free.first_dirty; pg = predecessor; } else { struct page_info *successor = pg + mask; /* Merge with successor block? */ if ( !mfn_valid(page_to_mfn(successor)) || !page_state_is(successor, free) || (PFN_ORDER(successor) != order) || (phys_to_nid(page_to_maddr(successor)) != node) ) break; check_and_stop_scrub(successor); /* Update pg's first_dirty if necessary. */ if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX && successor->u.free.first_dirty != INVALID_DIRTY_IDX ) pg->u.free.first_dirty = (1U << order) + successor->u.free.first_dirty; page_list_del(successor, &heap(node, zone, order)); } order++; } page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty); if ( tainted ) reserve_offlined_page(pg); spin_unlock(&heap_lock); } /* * Following rules applied for page offline: * Once a page is broken, it can't be assigned anymore * A page will be offlined only if it is free * return original count_info */ static unsigned long mark_page_offline(struct page_info *pg, int broken) { unsigned long nx, x, y = pg->count_info; ASSERT(page_is_ram_type(mfn_x(page_to_mfn(pg)), RAM_TYPE_CONVENTIONAL)); ASSERT(spin_is_locked(&heap_lock)); do { nx = x = y; if ( ((x & PGC_state) != PGC_state_offlined) && ((x & PGC_state) != PGC_state_offlining) ) { nx &= ~PGC_state; nx |= (((x & PGC_state) == PGC_state_free) ? 
PGC_state_offlined : PGC_state_offlining); } if ( broken ) nx |= PGC_broken; if ( x == nx ) break; } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x ); return y; } static int reserve_heap_page(struct page_info *pg) { struct page_info *head = NULL; unsigned int i, node = phys_to_nid(page_to_maddr(pg)); unsigned int zone = page_to_zone(pg); for ( i = 0; i <= MAX_ORDER; i++ ) { struct page_info *tmp; if ( page_list_empty(&heap(node, zone, i)) ) continue; page_list_for_each_safe ( head, tmp, &heap(node, zone, i) ) { if ( (head <= pg) && (head + (1UL << i) > pg) ) return reserve_offlined_page(head); } } return -EINVAL; } int offline_page(mfn_t mfn, int broken, uint32_t *status) { unsigned long old_info = 0; struct domain *owner; struct page_info *pg; if ( !mfn_valid(mfn) ) { dprintk(XENLOG_WARNING, "try to offline out of range page %"PRI_mfn"\n", mfn_x(mfn)); return -EINVAL; } *status = 0; pg = mfn_to_page(mfn); if ( is_xen_fixed_mfn(mfn) ) { *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED | (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT); return -EPERM; } /* * N.B. xen's txt in x86_64 is marked reserved and handled already. * Also kexec range is reserved. */ if ( !page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL) ) { *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM; return -EINVAL; } /* * NB. When broken page belong to guest, usually hypervisor will * notify the guest to handle the broken page. However, hypervisor * need to prevent malicious guest access the broken page again. * Under such case, hypervisor shutdown guest, preventing recursive mce. */ if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) ) { *status = PG_OFFLINE_AGAIN; domain_crash(owner); return 0; } spin_lock(&heap_lock); old_info = mark_page_offline(pg, broken); if ( page_state_is(pg, offlined) ) { reserve_heap_page(pg); spin_unlock(&heap_lock); *status = broken ? PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN : PG_OFFLINE_OFFLINED; return 0; } spin_unlock(&heap_lock); if ( (owner = page_get_owner_and_reference(pg)) ) { if ( p2m_pod_offline_or_broken_hit(pg) ) { put_page(pg); p2m_pod_offline_or_broken_replace(pg); *status = PG_OFFLINE_OFFLINED; } else { *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING | (owner->domain_id << PG_OFFLINE_OWNER_SHIFT); /* Release the reference since it will not be allocated anymore */ put_page(pg); } } else if ( old_info & PGC_xen_heap ) { *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING | (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT); } else { /* * assign_pages does not hold heap_lock, so small window that the owner * may be set later, but please notice owner will only change from * NULL to be set, not verse, since page is offlining now. * No windows If called from #MC handler, since all CPU are in softirq * If called from user space like CE handling, tools can wait some time * before call again. */ *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED | (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT ); } if ( broken ) *status |= PG_OFFLINE_BROKEN; return 0; } /* * Online the memory. * The caller should make sure end_pfn <= max_page, * if not, expand_pages() should be called prior to online_page(). 
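 *
 * On success the PGC_state_offlining/offlined marking is cleared; a page
 * which had already been fully offlined (and therefore removed from the
 * free lists) is handed back to the heap via free_heap_pages(). Broken
 * pages cannot be onlined.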
*/ unsigned int online_page(mfn_t mfn, uint32_t *status) { unsigned long x, nx, y; struct page_info *pg; int ret; if ( !mfn_valid(mfn) ) { dprintk(XENLOG_WARNING, "call expand_pages() first\n"); return -EINVAL; } pg = mfn_to_page(mfn); spin_lock(&heap_lock); y = pg->count_info; do { ret = *status = 0; if ( y & PGC_broken ) { ret = -EINVAL; *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN; break; } if ( (y & PGC_state) == PGC_state_offlined ) { page_list_del(pg, &page_offlined_list); *status = PG_ONLINE_ONLINED; } else if ( (y & PGC_state) == PGC_state_offlining ) { *status = PG_ONLINE_ONLINED; } else { break; } x = y; nx = (x & ~PGC_state) | PGC_state_inuse; } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x ); spin_unlock(&heap_lock); if ( (y & PGC_state) == PGC_state_offlined ) free_heap_pages(pg, 0, false); return ret; } int query_page_offline(mfn_t mfn, uint32_t *status) { struct page_info *pg; if ( !mfn_valid(mfn) || !page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL) ) { dprintk(XENLOG_WARNING, "call expand_pages() first\n"); return -EINVAL; } *status = 0; spin_lock(&heap_lock); pg = mfn_to_page(mfn); if ( page_state_is(pg, offlining) ) *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING; if ( pg->count_info & PGC_broken ) *status |= PG_OFFLINE_STATUS_BROKEN; if ( page_state_is(pg, offlined) ) *status |= PG_OFFLINE_STATUS_OFFLINED; spin_unlock(&heap_lock); return 0; } /* * Hand the specified arbitrary page range to the specified heap zone * checking the node_id of the previous page. If they differ and the * latter is not on a MAX_ORDER boundary, then we reserve the page by * not freeing it to the buddy allocator. */ static void init_heap_pages( struct page_info *pg, unsigned long nr_pages) { unsigned long i; bool idle_scrub = false; /* * Keep MFN 0 away from the buddy allocator to avoid crossing zone * boundary when merging two buddies. */ if ( !mfn_x(page_to_mfn(pg)) ) { if ( nr_pages-- <= 1 ) return; pg++; } /* * Some pages may not go through the boot allocator (e.g reserved * memory at boot but released just after --- kernel, initramfs, * etc.). * Update first_valid_mfn to ensure those regions are covered. */ spin_lock(&heap_lock); first_valid_mfn = mfn_min(page_to_mfn(pg), first_valid_mfn); spin_unlock(&heap_lock); if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE ) idle_scrub = true; for ( i = 0; i < nr_pages; i++ ) { unsigned int nid = phys_to_nid(page_to_maddr(pg+i)); if ( unlikely(!avail[nid]) ) { unsigned long s = mfn_x(page_to_mfn(pg + i)); unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1)); bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) && !(s & ((1UL << MAX_ORDER) - 1)) && (find_first_set_bit(e) <= find_first_set_bit(s)); unsigned long n; n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i, &use_tail); BUG_ON(i + n > nr_pages); if ( n && !use_tail ) { i += n - 1; continue; } if ( i + n == nr_pages ) break; nr_pages -= n; } free_heap_pages(pg + i, 0, scrub_debug || idle_scrub); } } static unsigned long avail_heap_pages( unsigned int zone_lo, unsigned int zone_hi, unsigned int node) { unsigned int i, zone; unsigned long free_pages = 0; if ( zone_hi >= NR_ZONES ) zone_hi = NR_ZONES - 1; for_each_online_node(i) { if ( !avail[i] ) continue; for ( zone = zone_lo; zone <= zone_hi; zone++ ) if ( (node == -1) || (node == i) ) free_pages += avail[i][zone]; } return free_pages; } void __init end_boot_allocator(void) { unsigned int i; /* Pages that are free now go to the domain sub-allocator. 
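 * The first loop below hands over one region on the boot CPU's node ahead
 * of the others (presumably so that this node's heap metadata can be set
 * up from local memory); the remaining regions then follow in reverse
 * order.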
*/ for ( i = 0; i < nr_bootmem_regions; i++ ) { struct bootmem_region *r = &bootmem_region_list[i]; if ( (r->s < r->e) && (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) ) { init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s); r->e = r->s; break; } } for ( i = nr_bootmem_regions; i-- > 0; ) { struct bootmem_region *r = &bootmem_region_list[i]; if ( r->s < r->e ) init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s); } nr_bootmem_regions = 0; if ( !dma_bitsize && (num_online_nodes() > 1) ) dma_bitsize = arch_get_dma_bitsize(); printk("Domain heap initialised"); if ( dma_bitsize ) printk(" DMA width %u bits", dma_bitsize); printk("\n"); } static void __init smp_scrub_heap_pages(void *data) { unsigned long mfn, start, end; struct page_info *pg; struct scrub_region *r; unsigned int temp_cpu, cpu_idx = 0; nodeid_t node; unsigned int cpu = smp_processor_id(); if ( data ) r = data; else { node = cpu_to_node(cpu); if ( node == NUMA_NO_NODE ) return; r = ®ion[node]; } /* Determine the current CPU's index into CPU's linked to this node. */ for_each_cpu ( temp_cpu, &r->cpus ) { if ( cpu == temp_cpu ) break; cpu_idx++; } /* Calculate the starting mfn for this CPU's memory block. */ start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset; /* Calculate the end mfn into this CPU's memory block for this iteration. */ if ( r->offset + chunk_size >= r->per_cpu_sz ) { end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz; if ( r->rem && (cpumask_weight(&r->cpus) - 1 == cpu_idx) ) end += r->rem; } else end = start + chunk_size; for ( mfn = start; mfn < end; mfn++ ) { pg = mfn_to_page(_mfn(mfn)); /* Check the mfn is valid and page is free. */ if ( !mfn_valid(_mfn(mfn)) || !page_state_is(pg, free) ) continue; scrub_one_page(pg); } } static int __init find_non_smt(unsigned int node, cpumask_t *dest) { cpumask_t node_cpus; unsigned int i, cpu; cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map); cpumask_clear(dest); for_each_cpu ( i, &node_cpus ) { if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) ) continue; cpu = cpumask_first(per_cpu(cpu_sibling_mask, i)); __cpumask_set_cpu(cpu, dest); } return cpumask_weight(dest); } /* * Scrub all unallocated pages in all heap zones. This function uses all * online cpu's to scrub the memory in parallel. */ static void __init scrub_heap_pages(void) { cpumask_t node_cpus, all_worker_cpus; unsigned int i, j; unsigned long offset, max_per_cpu_sz = 0; unsigned long start, end; unsigned long rem = 0; int last_distance, best_node; int cpus; cpumask_clear(&all_worker_cpus); /* Scrub block size. */ chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT; if ( chunk_size == 0 ) chunk_size = MB(128) >> PAGE_SHIFT; /* Round #0 - figure out amounts and which CPUs to use. */ for_each_online_node ( i ) { if ( !node_spanned_pages(i) ) continue; /* Calculate Node memory start and end address. */ start = max(node_start_pfn(i), mfn_x(first_valid_mfn)); end = min(node_start_pfn(i) + node_spanned_pages(i), max_page); /* Just in case NODE has 1 page and starts below first_valid_mfn. */ end = max(end, start); /* CPUs that are online and on this node (if none, that it is OK). */ cpus = find_non_smt(i, &node_cpus); cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus); if ( cpus <= 0 ) { /* No CPUs on this node. Round #2 will take of it. 
*/ rem = 0; region[i].per_cpu_sz = (end - start); } else { rem = (end - start) % cpus; region[i].per_cpu_sz = (end - start) / cpus; if ( region[i].per_cpu_sz > max_per_cpu_sz ) max_per_cpu_sz = region[i].per_cpu_sz; } region[i].start = start; region[i].rem = rem; cpumask_copy(®ion[i].cpus, &node_cpus); } printk("Scrubbing Free RAM on %d nodes using %d CPUs\n", num_online_nodes(), cpumask_weight(&all_worker_cpus)); /* Round: #1 - do NUMA nodes with CPUs. */ for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size ) { for_each_online_node ( i ) region[i].offset = offset; process_pending_softirqs(); spin_lock(&heap_lock); on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1); spin_unlock(&heap_lock); printk("."); } /* * Round #2: NUMA nodes with no CPUs get scrubbed with CPUs on the node * closest to us and with CPUs. */ for_each_online_node ( i ) { node_cpus = node_to_cpumask(i); if ( !cpumask_empty(&node_cpus) ) continue; last_distance = INT_MAX; best_node = first_node(node_online_map); /* Figure out which NODE CPUs are close. */ for_each_online_node ( j ) { u8 distance; if ( cpumask_empty(&node_to_cpumask(j)) ) continue; distance = __node_distance(i, j); if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) ) { last_distance = distance; best_node = j; } } /* * Use CPUs from best node, and if there are no CPUs on the * first node (the default) use the BSP. */ cpus = find_non_smt(best_node, &node_cpus); if ( cpus == 0 ) { __cpumask_set_cpu(smp_processor_id(), &node_cpus); cpus = 1; } /* We already have the node information from round #0. */ region[i].rem = region[i].per_cpu_sz % cpus; region[i].per_cpu_sz /= cpus; max_per_cpu_sz = region[i].per_cpu_sz; cpumask_copy(®ion[i].cpus, &node_cpus); for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size ) { region[i].offset = offset; process_pending_softirqs(); spin_lock(&heap_lock); on_selected_cpus(&node_cpus, smp_scrub_heap_pages, ®ion[i], 1); spin_unlock(&heap_lock); printk("."); } } printk("done.\n"); #ifdef CONFIG_SCRUB_DEBUG scrub_debug = true; #endif } void __init heap_init_late(void) { /* * Now that the heap is initialized set bounds * for the low mem virq algorithm. */ setup_low_mem_virq(); switch ( opt_bootscrub ) { default: ASSERT_UNREACHABLE(); /* Fall through */ case BOOTSCRUB_IDLE: printk("Scrubbing Free RAM in background\n"); break; case BOOTSCRUB_ON: scrub_heap_pages(); break; case BOOTSCRUB_OFF: break; } } /************************* * XEN-HEAP SUB-ALLOCATOR */ #if defined(CONFIG_SEPARATE_XENHEAP) void init_xenheap_pages(paddr_t ps, paddr_t pe) { ps = round_pgup(ps); pe = round_pgdown(pe); if ( pe <= ps ) return; /* * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to * prevent merging of power-of-two blocks across the zone boundary. 
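 * (The merge loop in free_heap_pages() only checks node, order and page
 * state, not the zone, so without this gap a freed block at the edge of
 * one zone could be coalesced with its neighbour from the other zone.)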
*/ if ( ps && !is_xen_heap_mfn(mfn_add(maddr_to_mfn(ps), -1)) ) ps += PAGE_SIZE; if ( !is_xen_heap_mfn(maddr_to_mfn(pe)) ) pe -= PAGE_SIZE; memguard_guard_range(maddr_to_virt(ps), pe - ps); init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT); } void *alloc_xenheap_pages(unsigned int order, unsigned int memflags) { struct page_info *pg; ASSERT(!in_irq()); pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN, order, memflags | MEMF_no_scrub, NULL); if ( unlikely(pg == NULL) ) return NULL; memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT)); return page_to_virt(pg); } void free_xenheap_pages(void *v, unsigned int order) { ASSERT(!in_irq()); if ( v == NULL ) return; memguard_guard_range(v, 1 << (order + PAGE_SHIFT)); free_heap_pages(virt_to_page(v), order, false); } #else /* !CONFIG_SEPARATE_XENHEAP */ void __init xenheap_max_mfn(unsigned long mfn) { ASSERT(!first_node_initialised); ASSERT(!xenheap_bits); BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG); xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS); printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits); } void init_xenheap_pages(paddr_t ps, paddr_t pe) { init_domheap_pages(ps, pe); } void *alloc_xenheap_pages(unsigned int order, unsigned int memflags) { struct page_info *pg; unsigned int i; ASSERT(!in_irq()); if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits ) memflags &= ~MEMF_bits(~0U); if ( !(memflags >> _MEMF_bits) ) memflags |= MEMF_bits(xenheap_bits); pg = alloc_domheap_pages(NULL, order, memflags | MEMF_no_scrub); if ( unlikely(pg == NULL) ) return NULL; for ( i = 0; i < (1u << order); i++ ) pg[i].count_info |= PGC_xen_heap; return page_to_virt(pg); } void free_xenheap_pages(void *v, unsigned int order) { struct page_info *pg; unsigned int i; ASSERT(!in_irq()); if ( v == NULL ) return; pg = virt_to_page(v); for ( i = 0; i < (1u << order); i++ ) pg[i].count_info &= ~PGC_xen_heap; free_heap_pages(pg, order, true); } #endif /* CONFIG_SEPARATE_XENHEAP */ /************************* * DOMAIN-HEAP SUB-ALLOCATOR */ void init_domheap_pages(paddr_t ps, paddr_t pe) { mfn_t smfn, emfn; ASSERT(!in_irq()); smfn = maddr_to_mfn(round_pgup(ps)); emfn = maddr_to_mfn(round_pgdown(pe)); if ( mfn_x(emfn) <= mfn_x(smfn) ) return; init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn)); } int assign_pages( struct domain *d, struct page_info *pg, unsigned int order, unsigned int memflags) { int rc = 0; unsigned long i; spin_lock(&d->page_alloc_lock); if ( unlikely(d->is_dying) ) { gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n", d->domain_id); rc = -EINVAL; goto out; } #ifndef NDEBUG { unsigned int extra_pages = 0; for ( i = 0; i < (1ul << order); i++ ) { ASSERT(!(pg[i].count_info & ~PGC_extra)); if ( pg[i].count_info & PGC_extra ) extra_pages++; } ASSERT(!extra_pages || ((memflags & MEMF_no_refcount) && extra_pages == 1u << order)); } #endif if ( pg[0].count_info & PGC_extra ) { d->extra_pages += 1u << order; memflags &= ~MEMF_no_refcount; } else if ( !(memflags & MEMF_no_refcount) ) { unsigned int tot_pages = domain_tot_pages(d) + (1 << order); if ( unlikely(tot_pages > d->max_pages) ) { gprintk(XENLOG_INFO, "Over-allocation for domain %u: " "%u > %u\n", d->domain_id, tot_pages, d->max_pages); rc = -E2BIG; goto out; } } if ( !(memflags & MEMF_no_refcount) && unlikely(domain_adjust_tot_pages(d, 1 << order) == (1 << order)) ) get_knownalive_domain(d); for ( i = 0; i < (1 << order); i++ ) { ASSERT(page_get_owner(&pg[i]) == NULL); page_set_owner(&pg[i], d); smp_wmb(); /* Domain pointer must be 
visible before updating refcnt. */ pg[i].count_info = (pg[i].count_info & PGC_extra) | PGC_allocated | 1; page_list_add_tail(&pg[i], page_to_list(d, &pg[i])); } out: spin_unlock(&d->page_alloc_lock); return rc; } struct page_info *alloc_domheap_pages( struct domain *d, unsigned int order, unsigned int memflags) { struct page_info *pg = NULL; unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1; unsigned int dma_zone; ASSERT(!in_irq()); bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d, bits ? : (BITS_PER_LONG+PAGE_SHIFT)); if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 ) return NULL; if ( memflags & MEMF_no_owner ) memflags |= MEMF_no_refcount; if ( !dma_bitsize ) memflags &= ~MEMF_no_dma; else if ( (dma_zone = bits_to_zone(dma_bitsize)) < zone_hi ) pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d); if ( (pg == NULL) && ((memflags & MEMF_no_dma) || ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order, memflags, d)) == NULL)) ) return NULL; if ( d && !(memflags & MEMF_no_owner) ) { if ( memflags & MEMF_no_refcount ) { unsigned long i; for ( i = 0; i < (1ul << order); i++ ) { ASSERT(!pg[i].count_info); pg[i].count_info = PGC_extra; } } if ( assign_pages(d, pg, order, memflags) ) { free_heap_pages(pg, order, memflags & MEMF_no_scrub); return NULL; } } return pg; } void free_domheap_pages(struct page_info *pg, unsigned int order) { struct domain *d = page_get_owner(pg); unsigned int i; bool drop_dom_ref; ASSERT(!in_irq()); if ( unlikely(is_xen_heap_page(pg)) ) { /* NB. May recursively lock from relinquish_memory(). */ spin_lock_recursive(&d->page_alloc_lock); for ( i = 0; i < (1 << order); i++ ) arch_free_heap_page(d, &pg[i]); d->xenheap_pages -= 1 << order; drop_dom_ref = (d->xenheap_pages == 0); spin_unlock_recursive(&d->page_alloc_lock); } else { bool scrub; if ( likely(d) && likely(d != dom_cow) ) { /* NB. May recursively lock from relinquish_memory(). */ spin_lock_recursive(&d->page_alloc_lock); for ( i = 0; i < (1 << order); i++ ) { if ( pg[i].u.inuse.type_info & PGT_count_mask ) { printk(XENLOG_ERR "pg[%u] MFN %"PRI_mfn" c=%#lx o=%u v=%#lx t=%#x\n", i, mfn_x(page_to_mfn(pg + i)), pg[i].count_info, pg[i].v.free.order, pg[i].u.free.val, pg[i].tlbflush_timestamp); BUG(); } arch_free_heap_page(d, &pg[i]); if ( pg[i].count_info & PGC_extra ) { ASSERT(d->extra_pages); d->extra_pages--; } } drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order)); spin_unlock_recursive(&d->page_alloc_lock); /* * Normally we expect a domain to clear pages before freeing them, * if it cares about the secrecy of their contents. However, after * a domain has died we assume responsibility for erasure. We do * scrub regardless if option scrub_domheap is set. */ scrub = d->is_dying || scrub_debug || opt_scrub_domheap; } else { /* * All we need to check is that on dom_cow only order-0 chunks * make it here. Due to the if() above, the only two possible * cases right now are d == NULL and d == dom_cow. To protect * against relaxation of that if() condition without updating the * check here, don't check d != dom_cow for now. */ ASSERT(!d || !order); drop_dom_ref = false; scrub = 1; } free_heap_pages(pg, order, scrub); } if ( drop_dom_ref ) put_domain(d); } unsigned long avail_domheap_pages_region( unsigned int node, unsigned int min_width, unsigned int max_width) { int zone_lo, zone_hi; zone_lo = min_width ? 
bits_to_zone(min_width) : (MEMZONE_XEN + 1); zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo)); zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1); zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi)); return avail_heap_pages(zone_lo, zone_hi, node); } unsigned long avail_domheap_pages(void) { return avail_heap_pages(MEMZONE_XEN + 1, NR_ZONES - 1, -1); } unsigned long avail_node_heap_pages(unsigned int nodeid) { return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid); } static void pagealloc_info(unsigned char key) { unsigned int zone = MEMZONE_XEN; unsigned long n, total = 0; printk("Physical memory information:\n"); printk(" Xen heap: %lukB free\n", avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10)); while ( ++zone < NR_ZONES ) { if ( (zone + PAGE_SHIFT) == dma_bitsize ) { printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10)); total = 0; } if ( (n = avail_heap_pages(zone, zone, -1)) != 0 ) { total += n; printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10)); } } printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10)); } static __init int pagealloc_keyhandler_init(void) { register_keyhandler('m', pagealloc_info, "memory info", 1); return 0; } __initcall(pagealloc_keyhandler_init); void scrub_one_page(struct page_info *pg) { if ( unlikely(pg->count_info & PGC_broken) ) return; #ifndef NDEBUG /* Avoid callers relying on allocations returning zeroed pages. */ unmap_domain_page(memset(__map_domain_page(pg), SCRUB_BYTE_PATTERN, PAGE_SIZE)); #else /* For a production build, clear_page() is the fastest way to scrub. */ clear_domain_page(_mfn(page_to_mfn(pg))); #endif } static void dump_heap(unsigned char key) { s_time_t now = NOW(); int i, j; printk("'%c' pressed -> dumping heap info (now = %"PRI_stime")\n", key, now); for ( i = 0; i < MAX_NUMNODES; i++ ) { if ( !avail[i] ) continue; for ( j = 0; j < NR_ZONES; j++ ) printk("heap[node=%d][zone=%d] -> %lu pages\n", i, j, avail[i][j]); } for ( i = 0; i < MAX_NUMNODES; i++ ) { if ( !node_need_scrub[i] ) continue; printk("Node %d has %lu unscrubbed pages\n", i, node_need_scrub[i]); } } static __init int register_heap_trigger(void) { register_keyhandler('H', dump_heap, "dump heap info", 1); return 0; } __initcall(register_heap_trigger); struct domain *get_pg_owner(domid_t domid) { struct domain *pg_owner = NULL, *curr = current->domain; if ( likely(domid == DOMID_SELF) ) { pg_owner = rcu_lock_current_domain(); goto out; } if ( unlikely(domid == curr->domain_id) ) { gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n"); goto out; } switch ( domid ) { case DOMID_IO: pg_owner = rcu_lock_domain(dom_io); break; case DOMID_XEN: pg_owner = rcu_lock_domain(dom_xen); break; default: if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL ) gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid); break; } out: return pg_owner; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */