/******************************************************************************
 * page_alloc.c
 *
 * Simple buddy heap allocator for Xen.
 *
 * Copyright (c) 2002-2004 K A Fraser
 * Copyright (c) 2006 IBM Ryan Harper
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/spinlock.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/keyhandler.h>
#include <xen/perfc.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
#include <xen/event.h>
#include <xen/tmem.h>
#include <xen/tmem_xen.h>
#include <public/sysctl.h>
#include <public/sched.h>
#include <asm/page.h>
#include <asm/numa.h>
#include <asm/flushtlb.h>
#ifdef CONFIG_X86
#include <asm/p2m.h>
#include <asm/setup.h> /* for highmem_start only */
#else
#define p2m_pod_offline_or_broken_hit(pg) 0
#define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
#endif

/*
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
 * e.g. 'badpage=0x3f45,0x8a321'. A range may also be given, e.g.
 * 'badpage=0x8a321-0x8a340'.
 */
static char __initdata opt_badpage[100] = "";
string_param("badpage", opt_badpage);

/*
 * no-bootscrub -> Free pages are not zeroed during boot.
 */
static bool_t opt_bootscrub __initdata = 1;
boolean_param("bootscrub", opt_bootscrub);

/*
 * Bit width of the DMA heap -- used to override the NUMA-node-first
 * allocation strategy, which can otherwise exhaust low memory.
 */
static unsigned int dma_bitsize;
integer_param("dma_bits", dma_bitsize);

#define round_pgdown(_p)  ((_p)&PAGE_MASK)
#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)

/* Offlined page list, protected by heap_lock. */
PAGE_LIST_HEAD(page_offlined_list);
/* Broken page list, protected by heap_lock. */
PAGE_LIST_HEAD(page_broken_list);
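/*
 * Illustrative sketch (compiled out, not part of the allocator): how the
 * rounding macros above behave, assuming 4 KiB pages (PAGE_SHIFT == 12).
 * The addresses used are hypothetical.
 */
#if 0
static void __init round_macros_example(void)
{
    /* round_pgdown() clears the sub-page bits: 0x12345 -> 0x12000. */
    ASSERT(round_pgdown(0x12345UL) == 0x12000UL);
    /* round_pgup() advances to the next page boundary: 0x12345 -> 0x13000. */
    ASSERT(round_pgup(0x12345UL) == 0x13000UL);
    /* Page-aligned addresses are unchanged by both macros. */
    ASSERT(round_pgup(0x12000UL) == 0x12000UL);
}
#endif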
/*************************
 * BOOT-TIME ALLOCATOR
 */

static unsigned long __initdata first_valid_mfn = ~0UL;

static struct bootmem_region {
    unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
} *__initdata bootmem_region_list;
static unsigned int __initdata nr_bootmem_regions;

static void __init boot_bug(int line)
{
    panic("Boot BUG at %s:%d\n", __FILE__, line);
}
#define BOOT_BUG_ON(p) if ( p ) boot_bug(__LINE__);

static void __init bootmem_region_add(unsigned long s, unsigned long e)
{
    unsigned int i;

    if ( (bootmem_region_list == NULL) && (s < e) )
        bootmem_region_list = mfn_to_virt(s++);

    if ( s >= e )
        return;

    for ( i = 0; i < nr_bootmem_regions; i++ )
        if ( s < bootmem_region_list[i].e )
            break;

    BOOT_BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
    BOOT_BUG_ON(nr_bootmem_regions ==
                (PAGE_SIZE / sizeof(struct bootmem_region)));

    memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
            (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
    bootmem_region_list[i] = (struct bootmem_region) { s, e };
    nr_bootmem_regions++;
}

static void __init bootmem_region_zap(unsigned long s, unsigned long e)
{
    unsigned int i;

    for ( i = 0; i < nr_bootmem_regions; i++ )
    {
        struct bootmem_region *r = &bootmem_region_list[i];
        if ( e <= r->s )
            break;
        if ( s >= r->e )
            continue;
        if ( s <= r->s )
        {
            r->s = min(e, r->e);
        }
        else if ( e >= r->e )
        {
            r->e = s;
        }
        else
        {
            unsigned long _e = r->e;
            r->e = s;
            bootmem_region_add(e, _e);
        }
    }
}

void __init init_boot_pages(paddr_t ps, paddr_t pe)
{
    unsigned long bad_spfn, bad_epfn;
    const char *p;
#ifdef CONFIG_X86
    const unsigned long *badpage = NULL;
    unsigned int i, array_size;
#endif

    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);

    bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);

#ifdef CONFIG_X86
    /*
     * Here we put platform-specific memory range workarounds, i.e. memory
     * known to be corrupt or otherwise in need of being reserved on
     * specific platforms. We retrieve these pages and remove them from the
     * memory region list.
     */
    badpage = get_platform_badpages(&array_size);
    if ( badpage )
    {
        for ( i = 0; i < array_size; i++ )
        {
            bootmem_region_zap(*badpage >> PAGE_SHIFT,
                               (*badpage >> PAGE_SHIFT) + 1);
            badpage++;
        }
    }
#endif

    /* Check new pages against the bad-page list. */
    p = opt_badpage;
    while ( *p != '\0' )
    {
        bad_spfn = simple_strtoul(p, &p, 0);
        bad_epfn = bad_spfn;

        if ( *p == '-' )
        {
            p++;
            bad_epfn = simple_strtoul(p, &p, 0);
            if ( bad_epfn < bad_spfn )
                bad_epfn = bad_spfn;
        }

        if ( *p == ',' )
            p++;
        else if ( *p != '\0' )
            break;

        bootmem_region_zap(bad_spfn, bad_epfn+1);
    }
}
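/*
 * Illustrative sketch (compiled out): how the sorted, non-overlapping
 * bootmem_region list evolves under add/zap. The MFN values are
 * hypothetical.
 */
#if 0
static void __init bootmem_region_example(void)
{
    /* Start with one free region covering MFNs 0x100..0x1ff. */
    bootmem_region_add(0x100, 0x200);

    /* Punching a hole splits it: 0x100..0x17f and 0x1a0..0x1ff remain. */
    bootmem_region_zap(0x180, 0x1a0);

    /* Adding a disjoint region keeps the list sorted by start MFN. */
    bootmem_region_add(0x300, 0x400);
}
#endif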
unsigned long __init alloc_boot_pages(
    unsigned long nr_pfns, unsigned long pfn_align)
{
    unsigned long pg, _e;
    int i;

    for ( i = nr_bootmem_regions - 1; i >= 0; i-- )
    {
        struct bootmem_region *r = &bootmem_region_list[i];

        pg = (r->e - nr_pfns) & ~(pfn_align - 1);
        if ( pg < r->s )
            continue;

#if defined(CONFIG_X86) && !defined(NDEBUG)
        /*
         * Filtering pfn_align == 1 since the only allocations using a bigger
         * alignment are the ones used for setting up the frame table chunks.
         * Those allocations get remapped anyway, i.e. them not having 1:1
         * mappings always accessible is not a problem.
         */
        if ( highmem_start && pfn_align == 1 &&
             r->e > PFN_DOWN(highmem_start) )
        {
            pg = r->s;
            if ( pg + nr_pfns > PFN_DOWN(highmem_start) )
                continue;
            r->s = pg + nr_pfns;
            return pg;
        }
#endif

        _e = r->e;
        r->e = pg;
        bootmem_region_add(pg + nr_pfns, _e);
        return pg;
    }

    BOOT_BUG_ON(1);
    return 0;
}



/*************************
 * BINARY BUDDY ALLOCATOR
 */

#define MEMZONE_XEN 0
#define NR_ZONES    (PADDR_BITS - PAGE_SHIFT + 1)

#define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
#define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
                          (fls(page_to_mfn(pg)) ? : 1))

typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
#define heap(node, zone, order) ((*_heap[node])[zone][order])

static unsigned long *avail[MAX_NUMNODES];
static long total_avail_pages;

/* TMEM: Reserve a fraction of memory for mid-size (0 < order < 9) allocations. */
static long midsize_alloc_zone_pages;
#define MIDSIZE_ALLOC_FRAC 128

static DEFINE_SPINLOCK(heap_lock);
static long outstanding_claims; /* total outstanding claims by all domains */

unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
{
    long dom_before, dom_after, dom_claimed, sys_before, sys_after;

    ASSERT(spin_is_locked(&d->page_alloc_lock));
    d->tot_pages += pages;

    /*
     * can test d->outstanding_pages race-free because it can only change
     * if d->page_alloc_lock and heap_lock are both held, see also
     * domain_set_outstanding_pages below
     */
    if ( !d->outstanding_pages )
        goto out;

    spin_lock(&heap_lock);
    /* adjust domain outstanding pages; may not go negative */
    dom_before = d->outstanding_pages;
    dom_after = dom_before - pages;
    BUG_ON(dom_before < 0);
    dom_claimed = dom_after < 0 ? 0 : dom_after;
    d->outstanding_pages = dom_claimed;
    /* flag accounting bug if system outstanding_claims would go negative */
    sys_before = outstanding_claims;
    sys_after = sys_before - (dom_before - dom_claimed);
    BUG_ON(sys_after < 0);
    outstanding_claims = sys_after;
    spin_unlock(&heap_lock);

out:
    return d->tot_pages;
}

int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
{
    int ret = -ENOMEM;
    unsigned long claim, avail_pages;

    /*
     * Take the domain's page_alloc_lock; otherwise all d->tot_pages
     * adjustments would always have to take the global heap_lock, rather
     * than only in the much rarer case that d->outstanding_pages is
     * non-zero.
     */
    spin_lock(&d->page_alloc_lock);
    spin_lock(&heap_lock);

    /* pages==0 means "unset" the claim. */
    if ( pages == 0 )
    {
        outstanding_claims -= d->outstanding_pages;
        d->outstanding_pages = 0;
        ret = 0;
        goto out;
    }

    /* only one active claim per domain please */
    if ( d->outstanding_pages )
    {
        ret = -EINVAL;
        goto out;
    }

    /* disallow a claim not exceeding current tot_pages or above max_pages */
    if ( (pages <= d->tot_pages) || (pages > d->max_pages) )
    {
        ret = -EINVAL;
        goto out;
    }

    /* how much memory is available? */
    avail_pages = total_avail_pages;

    /*
     * Note: the usage of claims means that an allocation from a guest
     * *might* have to come from freeable memory. Using free memory is
     * always better, if it is available, than using freeable memory.
     *
     * But that is OK: once a claim has been made, it can still take minutes
     * before the claim is fully satisfied. Tmem can make use of the
     * unclaimed pages during this time (to store ephemeral/freeable pages
     * only, not persistent pages).
     */
    avail_pages += tmem_freeable_pages();
    avail_pages -= outstanding_claims;

    /*
     * Note: if the domain has already allocated memory before making a
     * claim, then the claim must take tot_pages into account.
     */
    claim = pages - d->tot_pages;
    if ( claim > avail_pages )
        goto out;

    /* yay, claim fits in available memory, stake the claim, success! */
    d->outstanding_pages = claim;
    outstanding_claims += d->outstanding_pages;
    ret = 0;

 out:
    spin_unlock(&heap_lock);
    spin_unlock(&d->page_alloc_lock);
    return ret;
}
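/*
 * Illustrative sketch (compiled out): the intended lifecycle of a memory
 * claim, for a hypothetical domain pointer @d. In practice the claim is
 * staked from the toolstack and drained page for page as allocations call
 * domain_adjust_tot_pages() above.
 */
#if 0
static int claim_example(struct domain *d)
{
    int rc;

    /* Stake a claim for 1024 pages; fails with -ENOMEM if unsatisfiable. */
    rc = domain_set_outstanding_pages(d, 1024);
    if ( rc )
        return rc;

    /*
     * Subsequent allocations for @d reduce d->outstanding_pages, so the
     * claimed memory cannot be consumed by other domains in the meantime.
     */

    /* Passing zero cancels whatever remains of the claim. */
    return domain_set_outstanding_pages(d, 0);
}
#endif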
void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
{
    spin_lock(&heap_lock);
    *outstanding_pages = outstanding_claims;
    *free_pages = avail_domheap_pages();
    spin_unlock(&heap_lock);
}

static unsigned long init_node_heap(int node, unsigned long mfn,
                                    unsigned long nr, bool_t *use_tail)
{
    /* First node to be discovered has its heap metadata statically alloced. */
    static heap_by_zone_and_order_t _heap_static;
    static unsigned long avail_static[NR_ZONES];
    static int first_node_initialised;
    unsigned long needed = (sizeof(**_heap) +
                            sizeof(**avail) * NR_ZONES +
                            PAGE_SIZE - 1) >> PAGE_SHIFT;
#ifdef DIRECTMAP_VIRT_END
    unsigned long eva = min(DIRECTMAP_VIRT_END, HYPERVISOR_VIRT_END);
#endif
    int i, j;

    if ( !first_node_initialised )
    {
        _heap[node] = &_heap_static;
        avail[node] = avail_static;
        first_node_initialised = 1;
        needed = 0;
    }
#ifdef DIRECTMAP_VIRT_END
    else if ( *use_tail && nr >= needed &&
              (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) )
    {
        _heap[node] = mfn_to_virt(mfn + nr - needed);
        avail[node] = mfn_to_virt(mfn + nr - 1) +
                      PAGE_SIZE - sizeof(**avail) * NR_ZONES;
    }
    else if ( nr >= needed &&
              (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) )
    {
        _heap[node] = mfn_to_virt(mfn);
        avail[node] = mfn_to_virt(mfn + needed - 1) +
                      PAGE_SIZE - sizeof(**avail) * NR_ZONES;
        *use_tail = 0;
    }
#endif
    else if ( get_order_from_bytes(sizeof(**_heap)) ==
              get_order_from_pages(needed) )
    {
        _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
        BUG_ON(!_heap[node]);
        avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
                      sizeof(**avail) * NR_ZONES;
        needed = 0;
    }
    else
    {
        _heap[node] = xmalloc(heap_by_zone_and_order_t);
        avail[node] = xmalloc_array(unsigned long, NR_ZONES);
        BUG_ON(!_heap[node] || !avail[node]);
        needed = 0;
    }

    memset(avail[node], 0, NR_ZONES * sizeof(long));

    for ( i = 0; i < NR_ZONES; i++ )
        for ( j = 0; j <= MAX_ORDER; j++ )
            INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);

    return needed;
}

/* Default to 64 MiB */
#define DEFAULT_LOW_MEM_VIRQ    (((paddr_t) 64)   << 20)
#define MAX_LOW_MEM_VIRQ        (((paddr_t) 1024) << 20)

static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1);
size_param("low_mem_virq_limit", opt_low_mem_virq);

/* Thresholds to control hysteresis, in pages */
/* When memory grows above this threshold, reset hysteresis.
 * -1 initially to not reset until at least one virq issued. */
static unsigned long low_mem_virq_high      = -1UL;
/* Threshold at which we issue virq */
static unsigned long low_mem_virq_th        = 0;
/* Original threshold after all checks completed */
static unsigned long low_mem_virq_orig      = 0;
/* Order for current threshold */
static unsigned int  low_mem_virq_th_order  = 0;
/* Perform bootstrapping checks and set bounds */
static void __init setup_low_mem_virq(void)
{
    unsigned int order;
    paddr_t threshold;
    bool_t halve;

    /* If the user specifies zero, then he/she doesn't want this virq
     * to ever trigger. */
    if ( opt_low_mem_virq == 0 )
    {
        low_mem_virq_th = -1UL;
        return;
    }

    /* If the user did not specify a knob, remember that */
    halve = (opt_low_mem_virq == ((paddr_t) -1));
    threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq;

    /* Dom0 has already been allocated by now. So check we won't be
     * complaining immediately with whatever's left of the heap. */
    threshold = min(threshold,
                    ((paddr_t) total_avail_pages) << PAGE_SHIFT);

    /* Then, cap to some predefined maximum */
    threshold = min(threshold, MAX_LOW_MEM_VIRQ);

    /* If the user specified no knob, and we are at the current available
     * level, halve the threshold. */
    if ( halve &&
         (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) )
        threshold >>= 1;

    /* Zero? Have to fire immediately */
    threshold = max(threshold, (paddr_t) PAGE_SIZE);

    /* Threshold bytes -> pages */
    low_mem_virq_th = threshold >> PAGE_SHIFT;

    /* Next, round the threshold down to the next lower order */
    order = get_order_from_pages(low_mem_virq_th);
    if ( (1UL << order) > low_mem_virq_th )
        order--;

    /* Set bounds, ready to go */
    low_mem_virq_th = low_mem_virq_orig = 1UL << order;
    low_mem_virq_th_order = order;

    printk("Initial low memory virq threshold set at %#lx pages.\n",
            low_mem_virq_th);
}

static void check_low_mem_virq(void)
{
    unsigned long avail_pages = total_avail_pages +
        (opt_tmem ? tmem_freeable_pages() : 0) - outstanding_claims;

    if ( unlikely(avail_pages <= low_mem_virq_th) )
    {
        send_global_virq(VIRQ_ENOMEM);

        /* Update thresholds. The next warning will fire when we drop below
         * the next lower order. However, we wait until we grow beyond one
         * order above the current one before complaining again at this
         * order. */
        low_mem_virq_high   = 1UL << (low_mem_virq_th_order + 1);
        if ( low_mem_virq_th_order > 0 )
            low_mem_virq_th_order--;
        low_mem_virq_th     = 1UL << low_mem_virq_th_order;
        return;
    }

    if ( unlikely(avail_pages >= low_mem_virq_high) )
    {
        /* Reset hysteresis: bring the threshold up one order.
         * If we are back at the originally set threshold, set the high
         * watermark to -1 to avoid further growth of the virq
         * threshold. */
        low_mem_virq_th_order++;
        low_mem_virq_th = 1UL << low_mem_virq_th_order;
        if ( low_mem_virq_th == low_mem_virq_orig )
            low_mem_virq_high = -1UL;
        else
            low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2);
    }
}
/* Allocate 2^@order contiguous pages. */
static struct page_info *alloc_heap_pages(
    unsigned int zone_lo, unsigned int zone_hi,
    unsigned int order, unsigned int memflags,
    struct domain *d)
{
    unsigned int first_node, i, j, zone = 0, nodemask_retry = 0;
    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1);
    unsigned long request = 1UL << order;
    struct page_info *pg;
    nodemask_t nodemask = (d != NULL) ? d->node_affinity : node_online_map;
    bool_t need_tlbflush = 0;
    uint32_t tlbflush_timestamp = 0;

    if ( node == NUMA_NO_NODE )
    {
        memflags &= ~MEMF_exact_node;
        if ( d != NULL )
        {
            node = next_node(d->last_alloc_node, nodemask);
            if ( node >= MAX_NUMNODES )
                node = first_node(nodemask);
        }
        if ( node >= MAX_NUMNODES )
            node = cpu_to_node(smp_processor_id());
    }
    first_node = node;

    ASSERT(node >= 0);
    ASSERT(zone_lo <= zone_hi);
    ASSERT(zone_hi < NR_ZONES);

    if ( unlikely(order > MAX_ORDER) )
        return NULL;

    spin_lock(&heap_lock);

    /*
     * Claimed memory is considered unavailable unless the request
     * is made by a domain with sufficient unclaimed pages.
     */
    if ( (outstanding_claims + request >
          total_avail_pages + tmem_freeable_pages()) &&
          (d == NULL || d->outstanding_pages < request) )
        goto not_found;

    /*
     * TMEM: When available memory is scarce due to tmem absorbing it, allow
     * only mid-size allocations to avoid the worst of the fragmentation
     * issues. Other sizes try the tmem pools and then fail. This is a
     * workaround until all post-dom0-creation multi-page allocations can be
     * eliminated.
     */
    if ( opt_tmem && ((order == 0) || (order >= 9)) &&
         (total_avail_pages <= midsize_alloc_zone_pages) &&
         tmem_freeable_pages() )
        goto try_tmem;

    /*
     * Start with the requested node, but exhaust all of that node's memory
     * in the requested zone before failing. Only calculate a new node value
     * if we fail to find memory in the target node; this avoids needless
     * computation on the fast path.
     */
    for ( ; ; )
    {
        zone = zone_hi;
        do {
            /* Check if target node can support the allocation. */
            if ( !avail[node] || (avail[node][zone] < request) )
                continue;

            /* Find smallest order which can satisfy the request. */
            for ( j = order; j <= MAX_ORDER; j++ )
                if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
                    goto found;
        } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */

        if ( memflags & MEMF_exact_node )
            goto not_found;

        /* Pick next node. */
        if ( !node_isset(node, nodemask) )
        {
            /* Very first node may be caller-specified and outside nodemask. */
            ASSERT(!nodemask_retry);
            first_node = node = first_node(nodemask);
            if ( node < MAX_NUMNODES )
                continue;
        }
        else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
            node = first_node(nodemask);
        if ( node == first_node )
        {
            /* When we have tried all in nodemask, we fall back to others. */
            if ( nodemask_retry++ )
                goto not_found;
            nodes_andnot(nodemask, node_online_map, nodemask);
            first_node = node = first_node(nodemask);
            if ( node >= MAX_NUMNODES )
                goto not_found;
        }
    }

 try_tmem:
    /* Try to free memory from tmem. */
    if ( (pg = tmem_relinquish_pages(order, memflags)) != NULL )
    {
        /* reassigning an already allocated anonymous heap page */
        spin_unlock(&heap_lock);
        return pg;
    }

 not_found:
    /* No suitable memory blocks. Fail the request. */
    spin_unlock(&heap_lock);
    return NULL;

 found:
    /* We may have to halve the chunk a number of times. */
    while ( j != order )
    {
        PFN_ORDER(pg) = --j;
        page_list_add_tail(pg, &heap(node, zone, j));
        pg += 1 << j;
    }

    ASSERT(avail[node][zone] >= request);
    avail[node][zone] -= request;
    total_avail_pages -= request;
    ASSERT(total_avail_pages >= 0);

    check_low_mem_virq();

    if ( d != NULL )
        d->last_alloc_node = node;

    for ( i = 0; i < (1 << order); i++ )
    {
        /* Reference count must continuously be zero for free pages. */
        BUG_ON(pg[i].count_info != PGC_state_free);
        pg[i].count_info = PGC_state_inuse;

        if ( pg[i].u.free.need_tlbflush &&
             (pg[i].tlbflush_timestamp <= tlbflush_current_time()) &&
             (!need_tlbflush ||
              (pg[i].tlbflush_timestamp > tlbflush_timestamp)) )
        {
            need_tlbflush = 1;
            tlbflush_timestamp = pg[i].tlbflush_timestamp;
        }

        /* Initialise fields which have other uses for free pages. */
        pg[i].u.inuse.type_info = 0;
        page_set_owner(&pg[i], NULL);
    }

    spin_unlock(&heap_lock);

    if ( need_tlbflush )
    {
        cpumask_t mask = cpu_online_map;

        tlbflush_filter(mask, tlbflush_timestamp);
        if ( !cpumask_empty(&mask) )
        {
            perfc_incr(need_flush_tlb_flush);
            flush_tlb_mask(&mask);
        }
    }

    return pg;
}
/* Remove any offlined page in the buddy pointed to by head. */
static int reserve_offlined_page(struct page_info *head)
{
    unsigned int node = phys_to_nid(page_to_maddr(head));
    int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
    struct page_info *cur_head;
    int cur_order;

    ASSERT(spin_is_locked(&heap_lock));

    cur_head = head;

    page_list_del(head, &heap(node, zone, head_order));

    while ( cur_head < (head + (1 << head_order)) )
    {
        struct page_info *pg;
        int next_order;

        if ( page_state_is(cur_head, offlined) )
        {
            cur_head++;
            continue;
        }

        next_order = cur_order = 0;

        while ( cur_order < head_order )
        {
            next_order = cur_order + 1;

            if ( (cur_head + (1 << next_order)) >= (head + (1 << head_order)) )
                goto merge;

            for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order);
                  i < (1 << next_order);
                  i++, pg++ )
                if ( page_state_is(pg, offlined) )
                    break;
            if ( i == (1 << next_order) )
            {
                cur_order = next_order;
                continue;
            }
            else
            {
 merge:
                /* We don't consider merging outside the head_order. */
                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
                PFN_ORDER(cur_head) = cur_order;
                cur_head += (1 << cur_order);
                break;
            }
        }
    }

    for ( cur_head = head; cur_head < head + (1UL << head_order); cur_head++ )
    {
        if ( !page_state_is(cur_head, offlined) )
            continue;

        avail[node][zone]--;
        total_avail_pages--;
        ASSERT(total_avail_pages >= 0);

        page_list_add_tail(cur_head,
                           test_bit(_PGC_broken, &cur_head->count_info) ?
                           &page_broken_list : &page_offlined_list);

        count++;
    }

    return count;
}
/* Free 2^@order set of pages. */
static void free_heap_pages(
    struct page_info *pg, unsigned int order)
{
    unsigned long mask, mfn = page_to_mfn(pg);
    unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
    unsigned int zone = page_to_zone(pg);

    ASSERT(order <= MAX_ORDER);
    ASSERT(node >= 0);

    spin_lock(&heap_lock);

    for ( i = 0; i < (1 << order); i++ )
    {
        /*
         * Cannot assume that count_info == 0, as there are some corner cases
         * where it isn't the case and yet it isn't a bug:
         *  1. page_get_owner() is NULL
         *  2. page_get_owner() is a domain that was never accessible by
         *     its domid (e.g., failed to fully construct the domain).
         *  3. page was never addressable by the guest (e.g., it's an
         *     auto-translate-physmap guest and the page was never included
         *     in its pseudophysical address space).
         * In all the above cases there can be no guest mappings of this page.
         */
        ASSERT(!page_state_is(&pg[i], offlined));
        pg[i].count_info =
            ((pg[i].count_info & PGC_broken) |
             (page_state_is(&pg[i], offlining)
              ? PGC_state_offlined : PGC_state_free));
        if ( page_state_is(&pg[i], offlined) )
            tainted = 1;

        /* If a page has no owner it will need no safety TLB flush. */
        pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
        if ( pg[i].u.free.need_tlbflush )
            pg[i].tlbflush_timestamp = tlbflush_current_time();

        /* This page is not a guest frame any more. */
        page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
        set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
    }

    avail[node][zone] += 1 << order;
    total_avail_pages += 1 << order;

    if ( opt_tmem )
        midsize_alloc_zone_pages = max(
            midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);

    /* Merge chunks as far as possible. */
    while ( order < MAX_ORDER )
    {
        mask = 1UL << order;

        if ( (page_to_mfn(pg) & mask) )
        {
            /* Merge with predecessor block? */
            if ( !mfn_valid(page_to_mfn(pg-mask)) ||
                 !page_state_is(pg-mask, free) ||
                 (PFN_ORDER(pg-mask) != order) ||
                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
                break;
            pg -= mask;
            page_list_del(pg, &heap(node, zone, order));
        }
        else
        {
            /* Merge with successor block? */
            if ( !mfn_valid(page_to_mfn(pg+mask)) ||
                 !page_state_is(pg+mask, free) ||
                 (PFN_ORDER(pg+mask) != order) ||
                 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
                break;
            page_list_del(pg + mask, &heap(node, zone, order));
        }

        order++;
    }

    PFN_ORDER(pg) = order;
    page_list_add_tail(pg, &heap(node, zone, order));

    if ( tainted )
        reserve_offlined_page(pg);

    spin_unlock(&heap_lock);
}


/*
 * The following rules apply to page offlining:
 *  - Once a page is broken, it can never be assigned again.
 *  - A page will be offlined only if it is currently free.
 * Returns the original count_info.
 */
static unsigned long mark_page_offline(struct page_info *pg, int broken)
{
    unsigned long nx, x, y = pg->count_info;

    ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
    ASSERT(spin_is_locked(&heap_lock));

    do {
        nx = x = y;

        if ( ((x & PGC_state) != PGC_state_offlined) &&
             ((x & PGC_state) != PGC_state_offlining) )
        {
            nx &= ~PGC_state;
            nx |= (((x & PGC_state) == PGC_state_free)
                   ? PGC_state_offlined : PGC_state_offlining);
        }

        if ( broken )
            nx |= PGC_broken;

        if ( x == nx )
            break;
    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );

    return y;
}

static int reserve_heap_page(struct page_info *pg)
{
    struct page_info *head = NULL;
    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
    unsigned int zone = page_to_zone(pg);

    for ( i = 0; i <= MAX_ORDER; i++ )
    {
        struct page_info *tmp;

        if ( page_list_empty(&heap(node, zone, i)) )
            continue;

        page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
        {
            if ( (head <= pg) &&
                 (head + (1UL << i) > pg) )
                return reserve_offlined_page(head);
        }
    }

    return -EINVAL;
}
int offline_page(unsigned long mfn, int broken, uint32_t *status)
{
    unsigned long old_info = 0;
    struct domain *owner;
    int ret = 0;
    struct page_info *pg;

    if ( !mfn_valid(mfn) )
    {
        dprintk(XENLOG_WARNING,
                "attempt to offline page out of range %lx\n", mfn);
        return -EINVAL;
    }

    *status = 0;
    pg = mfn_to_page(mfn);

    if ( is_xen_fixed_mfn(mfn) )
    {
        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
        return -EPERM;
    }

    /*
     * N.B. Xen's text section on x86_64 is marked reserved and handled
     * already. The kexec range is reserved too.
     */
    if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
    {
        *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
        return -EINVAL;
    }

    /*
     * NB. When a broken page belongs to a guest, the hypervisor usually
     * notifies the guest to handle the broken page. However, the hypervisor
     * needs to prevent a malicious guest from accessing the broken page
     * again, so in that case it shuts the guest down, preventing a
     * recursive MCE.
     */
    if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) )
    {
        *status = PG_OFFLINE_AGAIN;
        domain_shutdown(owner, SHUTDOWN_crash);
        return 0;
    }

    spin_lock(&heap_lock);

    old_info = mark_page_offline(pg, broken);

    if ( page_state_is(pg, offlined) )
    {
        reserve_heap_page(pg);
        *status = PG_OFFLINE_OFFLINED;
    }
    else if ( (owner = page_get_owner_and_reference(pg)) )
    {
        if ( p2m_pod_offline_or_broken_hit(pg) )
            goto pod_replace;
        else
        {
            *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
              (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
            /* Release the reference since it will not be allocated anymore */
            put_page(pg);
        }
    }
    else if ( old_info & PGC_xen_heap )
    {
        *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
          (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
    }
    else
    {
        /*
         * assign_pages does not hold heap_lock, so there is a small window
         * in which an owner may still be set; but note that the owner can
         * only change from NULL to non-NULL, never the reverse, since the
         * page is offlining now.
         * There is no such window if called from the #MC handler, since all
         * CPUs are in softirq context then. If called from user space, e.g.
         * for CE handling, the tools can wait some time before calling
         * again.
         */
        *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
          (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT);
    }

    if ( broken )
        *status |= PG_OFFLINE_BROKEN;

    spin_unlock(&heap_lock);

    return ret;

 pod_replace:
    put_page(pg);
    spin_unlock(&heap_lock);

    p2m_pod_offline_or_broken_replace(pg);
    *status = PG_OFFLINE_OFFLINED;

    if ( broken )
        *status |= PG_OFFLINE_BROKEN;

    return ret;
}

/*
 * Online the memory.
 * The caller should make sure end_pfn <= max_page;
 * if not, expand_pages() should be called prior to online_page().
 */
unsigned int online_page(unsigned long mfn, uint32_t *status)
{
    unsigned long x, nx, y;
    struct page_info *pg;
    int ret;

    if ( !mfn_valid(mfn) )
    {
        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
        return -EINVAL;
    }

    pg = mfn_to_page(mfn);

    spin_lock(&heap_lock);

    y = pg->count_info;
    do {
        ret = *status = 0;

        if ( y & PGC_broken )
        {
            ret = -EINVAL;
            *status = PG_ONLINE_FAILED | PG_ONLINE_BROKEN;
            break;
        }

        if ( (y & PGC_state) == PGC_state_offlined )
        {
            page_list_del(pg, &page_offlined_list);
            *status = PG_ONLINE_ONLINED;
        }
        else if ( (y & PGC_state) == PGC_state_offlining )
        {
            *status = PG_ONLINE_ONLINED;
        }
        else
        {
            break;
        }

        x = y;
        nx = (x & ~PGC_state) | PGC_state_inuse;
    } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );

    spin_unlock(&heap_lock);

    if ( (y & PGC_state) == PGC_state_offlined )
        free_heap_pages(pg, 0);

    return ret;
}

int query_page_offline(unsigned long mfn, uint32_t *status)
{
    struct page_info *pg;

    if ( !mfn_valid(mfn) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
    {
        dprintk(XENLOG_WARNING, "call expand_pages() first\n");
        return -EINVAL;
    }

    *status = 0;
    spin_lock(&heap_lock);

    pg = mfn_to_page(mfn);

    if ( page_state_is(pg, offlining) )
        *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
    if ( pg->count_info & PGC_broken )
        *status |= PG_OFFLINE_STATUS_BROKEN;
    if ( page_state_is(pg, offlined) )
        *status |= PG_OFFLINE_STATUS_OFFLINED;

    spin_unlock(&heap_lock);

    return 0;
}
/*
 * Hand the specified arbitrary page range to the specified heap zone,
 * checking the node_id of the previous page. If they differ and the latter
 * is not on a MAX_ORDER boundary, then we reserve the page by not freeing
 * it to the buddy allocator.
 */
static void init_heap_pages(
    struct page_info *pg, unsigned long nr_pages)
{
    unsigned long i;

    for ( i = 0; i < nr_pages; i++ )
    {
        unsigned int nid = phys_to_nid(page_to_maddr(pg+i));

        if ( unlikely(!avail[nid]) )
        {
            unsigned long s = page_to_mfn(pg + i);
            unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
            bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
                              !(s & ((1UL << MAX_ORDER) - 1)) &&
                              (find_first_set_bit(e) <= find_first_set_bit(s));
            unsigned long n;

            n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
                               &use_tail);
            BUG_ON(i + n > nr_pages);
            if ( n && !use_tail )
            {
                i += n - 1;
                continue;
            }
            if ( i + n == nr_pages )
                break;
            nr_pages -= n;
        }

        free_heap_pages(pg+i, 0);
    }
}

static unsigned long avail_heap_pages(
    unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
{
    unsigned int i, zone;
    unsigned long free_pages = 0;

    if ( zone_hi >= NR_ZONES )
        zone_hi = NR_ZONES - 1;

    for_each_online_node(i)
    {
        if ( !avail[i] )
            continue;
        for ( zone = zone_lo; zone <= zone_hi; zone++ )
            if ( (node == -1) || (node == i) )
                free_pages += avail[i][zone];
    }

    return free_pages;
}

unsigned long total_free_pages(void)
{
    return total_avail_pages - midsize_alloc_zone_pages;
}

void __init end_boot_allocator(void)
{
    unsigned int i;

    /* Pages that are free now go to the domain sub-allocator. */
    for ( i = 0; i < nr_bootmem_regions; i++ )
    {
        struct bootmem_region *r = &bootmem_region_list[i];
        if ( (r->s < r->e) &&
             (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
        {
            init_heap_pages(mfn_to_page(r->s), r->e - r->s);
            r->e = r->s;
            break;
        }
    }
    for ( i = nr_bootmem_regions; i-- > 0; )
    {
        struct bootmem_region *r = &bootmem_region_list[i];
        if ( r->s < r->e )
            init_heap_pages(mfn_to_page(r->s), r->e - r->s);
    }
    init_heap_pages(virt_to_page(bootmem_region_list), 1);

    if ( !dma_bitsize && (num_online_nodes() > 1) )
    {
#ifdef CONFIG_X86
        dma_bitsize = min_t(unsigned int,
                            fls(NODE_DATA(0)->node_spanned_pages) - 1
                            + PAGE_SHIFT - 2,
                            32);
#else
        dma_bitsize = 32;
#endif
    }

    printk("Domain heap initialised");
    if ( dma_bitsize )
        printk(" DMA width %u bits", dma_bitsize);
    printk("\n");
}

/*
 * Scrub all unallocated pages in all heap zones. This function is more
 * convoluted than it appears necessary because we do not want to
 * continuously hold the lock while scrubbing very large memory areas.
 */
void __init scrub_heap_pages(void)
{
    unsigned long mfn;
    struct page_info *pg;

    if ( !opt_bootscrub )
        return;

    printk("Scrubbing Free RAM: ");

    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
    {
        process_pending_softirqs();

        pg = mfn_to_page(mfn);

        /* Quick lock-free check. */
        if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
            continue;

        /* Every 100MB, print a progress dot. */
        if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
            printk(".");

        spin_lock(&heap_lock);

        /* Re-check page status with lock held. */
        if ( page_state_is(pg, free) )
            scrub_one_page(pg);

        spin_unlock(&heap_lock);
    }

    printk("done.\n");

    /*
     * Now that the heap is initialised, run checks and set bounds for the
     * low-mem virq algorithm.
     */
    setup_low_mem_virq();
}
/*************************
 * XEN-HEAP SUB-ALLOCATOR
 */

#if defined(CONFIG_SEPARATE_XENHEAP)

void init_xenheap_pages(paddr_t ps, paddr_t pe)
{
    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    /*
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
     * prevent merging of power-of-two blocks across the zone boundary.
     */
    if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
        ps += PAGE_SIZE;
    if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
        pe -= PAGE_SIZE;

    memguard_guard_range(maddr_to_virt(ps), pe - ps);

    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
}

void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pg;

    ASSERT(!in_irq());

    pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
                          order, memflags, NULL);
    if ( unlikely(pg == NULL) )
        return NULL;

    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));

    return page_to_virt(pg);
}

void free_xenheap_pages(void *v, unsigned int order)
{
    ASSERT(!in_irq());

    if ( v == NULL )
        return;

    memguard_guard_range(v, 1 << (order + PAGE_SHIFT));

    free_heap_pages(virt_to_page(v), order);
}

#else

static unsigned int __read_mostly xenheap_bits;

void __init xenheap_max_mfn(unsigned long mfn)
{
    xenheap_bits = fls(mfn) + PAGE_SHIFT - 1;
}

void init_xenheap_pages(paddr_t ps, paddr_t pe)
{
    init_domheap_pages(ps, pe);
}

void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pg;
    unsigned int i;

    ASSERT(!in_irq());

    if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits )
        memflags &= ~MEMF_bits(~0);
    if ( !(memflags >> _MEMF_bits) )
        memflags |= MEMF_bits(xenheap_bits);

    pg = alloc_domheap_pages(NULL, order, memflags);
    if ( unlikely(pg == NULL) )
        return NULL;

    for ( i = 0; i < (1u << order); i++ )
        pg[i].count_info |= PGC_xen_heap;

    return page_to_virt(pg);
}

void free_xenheap_pages(void *v, unsigned int order)
{
    struct page_info *pg;
    unsigned int i;

    ASSERT(!in_irq());

    if ( v == NULL )
        return;

    pg = virt_to_page(v);

    for ( i = 0; i < (1u << order); i++ )
        pg[i].count_info &= ~PGC_xen_heap;

    free_heap_pages(pg, order);
}

#endif



/*************************
 * DOMAIN-HEAP SUB-ALLOCATOR
 */

void init_domheap_pages(paddr_t ps, paddr_t pe)
{
    unsigned long smfn, emfn;

    ASSERT(!in_irq());

    smfn = round_pgup(ps) >> PAGE_SHIFT;
    emfn = round_pgdown(pe) >> PAGE_SHIFT;

    init_heap_pages(mfn_to_page(smfn), emfn - smfn);
}


int assign_pages(
    struct domain *d,
    struct page_info *pg,
    unsigned int order,
    unsigned int memflags)
{
    unsigned long i;

    spin_lock(&d->page_alloc_lock);

    if ( unlikely(d->is_dying) )
    {
        gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
                 d->domain_id);
        goto fail;
    }

    if ( !(memflags & MEMF_no_refcount) )
    {
        if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
        {
            if ( !opt_tmem || order != 0 || d->tot_pages != d->max_pages )
                gdprintk(XENLOG_INFO, "Over-allocation for domain %u: "
                         "%u > %u\n", d->domain_id,
                         d->tot_pages + (1 << order), d->max_pages);
            goto fail;
        }

        if ( unlikely(d->tot_pages == 0) )
            get_knownalive_domain(d);

        domain_adjust_tot_pages(d, 1 << order);
    }

    for ( i = 0; i < (1 << order); i++ )
    {
        ASSERT(page_get_owner(&pg[i]) == NULL);
        ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
        page_set_owner(&pg[i], d);
        smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
        pg[i].count_info = PGC_allocated | 1;
        page_list_add_tail(&pg[i], &d->page_list);
    }

    spin_unlock(&d->page_alloc_lock);
    return 0;

 fail:
    spin_unlock(&d->page_alloc_lock);
    return -1;
}
struct page_info *alloc_domheap_pages(
    struct domain *d, unsigned int order, unsigned int memflags)
{
    struct page_info *pg = NULL;
    unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
    unsigned int dma_zone;

    ASSERT(!in_irq());

    bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
    if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
        return NULL;

    if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);

    if ( (pg == NULL) &&
         ((memflags & MEMF_no_dma) ||
          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
                                  memflags, d)) == NULL)) )
         return NULL;

    if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
    {
        free_heap_pages(pg, order);
        return NULL;
    }

    return pg;
}

void free_domheap_pages(struct page_info *pg, unsigned int order)
{
    int i, drop_dom_ref;
    struct domain *d = page_get_owner(pg);

    ASSERT(!in_irq());

    if ( unlikely(is_xen_heap_page(pg)) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
            page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);

        d->xenheap_pages -= 1 << order;
        drop_dom_ref = (d->xenheap_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);
    }
    else if ( likely(d != NULL) && likely(d != dom_cow) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
        {
            BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
            page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
        }

        domain_adjust_tot_pages(d, -(1 << order));
        drop_dom_ref = (d->tot_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);

        /*
         * Normally we expect a domain to clear pages before freeing them,
         * if it cares about the secrecy of their contents. However, after
         * a domain has died we assume responsibility for erasure.
         */
        if ( unlikely(d->is_dying) )
            for ( i = 0; i < (1 << order); i++ )
                scrub_one_page(&pg[i]);

        free_heap_pages(pg, order);
    }
    else if ( unlikely(d == dom_cow) )
    {
        ASSERT(order == 0);
        scrub_one_page(pg);
        free_heap_pages(pg, 0);
        drop_dom_ref = 0;
    }
    else
    {
        /* Freeing anonymous domain-heap pages. */
        free_heap_pages(pg, order);
        drop_dom_ref = 0;
    }

    if ( drop_dom_ref )
        put_domain(d);
}
unsigned long avail_domheap_pages_region(
    unsigned int node, unsigned int min_width, unsigned int max_width)
{
    int zone_lo, zone_hi;

    zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
    zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));

    zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
    zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));

    return avail_heap_pages(zone_lo, zone_hi, node);
}

unsigned long avail_domheap_pages(void)
{
    return avail_heap_pages(MEMZONE_XEN + 1, NR_ZONES - 1, -1);
}

unsigned long avail_node_heap_pages(unsigned int nodeid)
{
    return avail_heap_pages(MEMZONE_XEN, NR_ZONES - 1, nodeid);
}


static void pagealloc_info(unsigned char key)
{
    unsigned int zone = MEMZONE_XEN;
    unsigned long n, total = 0;

    printk("Physical memory information:\n");
    printk("    Xen heap: %lukB free\n",
           avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));

    while ( ++zone < NR_ZONES )
    {
        if ( (zone + PAGE_SHIFT) == dma_bitsize )
        {
            printk("    DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
            total = 0;
        }

        if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
        {
            total += n;
            printk("    heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
        }
    }

    printk("    Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
}

static struct keyhandler pagealloc_info_keyhandler = {
    .diagnostic = 1,
    .u.fn = pagealloc_info,
    .desc = "memory info"
};

static __init int pagealloc_keyhandler_init(void)
{
    register_keyhandler('m', &pagealloc_info_keyhandler);
    return 0;
}
__initcall(pagealloc_keyhandler_init);


void scrub_one_page(struct page_info *pg)
{
    void *p;

    /* Check for a broken page before mapping, so no mapping is leaked. */
    if ( unlikely(pg->count_info & PGC_broken) )
        return;

    p = __map_domain_page(pg);

#ifndef NDEBUG
    /* Avoid callers relying on allocations returning zeroed pages. */
    memset(p, 0xc2, PAGE_SIZE);
#else
    /* For a production build, clear_page() is the fastest way to scrub. */
    clear_page(p);
#endif

    unmap_domain_page(p);
}

static void dump_heap(unsigned char key)
{
    s_time_t now = NOW();
    int i, j;

    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
           (u32)(now>>32), (u32)now);

    for ( i = 0; i < MAX_NUMNODES; i++ )
    {
        if ( !avail[i] )
            continue;
        for ( j = 0; j < NR_ZONES; j++ )
            printk("heap[node=%d][zone=%d] -> %lu pages\n",
                   i, j, avail[i][j]);
    }
}

static struct keyhandler dump_heap_keyhandler = {
    .diagnostic = 1,
    .u.fn = dump_heap,
    .desc = "dump heap info"
};

static __init int register_heap_trigger(void)
{
    register_keyhandler('H', &dump_heap_keyhandler);
    return 0;
}
__initcall(register_heap_trigger);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */