diff options
author | iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk> | 2004-05-24 18:13:06 +0000 |
---|---|---|
committer | iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk> | 2004-05-24 18:13:06 +0000 |
commit | 84b3199c7847e50cb3d777127327674b21540801 (patch) | |
tree | a31f8c33c41b7ba3f66c16fa65b7f9895540c5a6 | |
parent | fdb0b778782b5a8606b37689bbfbe004fdac2722 (diff) | |
download | xen-84b3199c7847e50cb3d777127327674b21540801.tar.gz xen-84b3199c7847e50cb3d777127327674b21540801.tar.bz2 xen-84b3199c7847e50cb3d777127327674b21540801.zip |
bitkeeper revision 1.921 (40b23b32vMbvKCdgtCukVpQP62ZvYA)
Add more stats to migration code, scan pages in pseudo-random permutation,
misc cleanups.
-rw-r--r-- | tools/xc/lib/xc.h | 4 | ||||
-rw-r--r-- | tools/xc/lib/xc_domain.c | 7 | ||||
-rw-r--r-- | tools/xc/lib/xc_linux_restore.c | 109 | ||||
-rw-r--r-- | tools/xc/lib/xc_linux_save.c | 156 | ||||
-rw-r--r-- | tools/xc/lib/xc_private.c | 171 | ||||
-rw-r--r-- | tools/xc/py/Xc.c | 4 | ||||
-rw-r--r-- | xen/common/dom0_ops.c | 14 | ||||
-rw-r--r-- | xen/common/shadow.c | 41 | ||||
-rw-r--r-- | xen/include/asm-i386/processor.h | 16 | ||||
-rw-r--r-- | xen/include/hypervisor-ifs/dom0_ops.h | 13 | ||||
-rw-r--r-- | xen/include/xen/shadow.h | 8 | ||||
-rw-r--r-- | xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c | 2 |
12 files changed, 260 insertions, 285 deletions
diff --git a/tools/xc/lib/xc.h b/tools/xc/lib/xc.h index 223710fad1..3423c4ec46 100644 --- a/tools/xc/lib/xc.h +++ b/tools/xc/lib/xc.h @@ -59,7 +59,9 @@ int xc_shadow_control(int xc_handle, u64 domid, unsigned int sop, unsigned long *dirty_bitmap, - unsigned long pages); + unsigned long pages, + unsigned long *fault_count, + unsigned long *dirty_count); #define XCFLAGS_VERBOSE 1 diff --git a/tools/xc/lib/xc_domain.c b/tools/xc/lib/xc_domain.c index 97b2b26f6d..dd8b4ec35f 100644 --- a/tools/xc/lib/xc_domain.c +++ b/tools/xc/lib/xc_domain.c @@ -112,7 +112,9 @@ int xc_shadow_control(int xc_handle, u64 domid, unsigned int sop, unsigned long *dirty_bitmap, - unsigned long pages) + unsigned long pages, + unsigned long *fault_count, + unsigned long *dirty_count) { int rc; dom0_op_t op; @@ -124,6 +126,9 @@ int xc_shadow_control(int xc_handle, rc = do_dom0_op(xc_handle, &op); + if(fault_count) *fault_count = op.u.shadow_control.fault_count; + if(dirty_count) *dirty_count = op.u.shadow_control.dirty_count; + if ( rc == 0 ) return op.u.shadow_control.pages; else diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c index 4e89b5715f..e756ad6ffd 100644 --- a/tools/xc/lib/xc_linux_restore.c +++ b/tools/xc/lib/xc_linux_restore.c @@ -88,6 +88,9 @@ int xc_linux_restore(int xc_handle, /* A table containg the type of each PFN (/not/ MFN!). */ unsigned long *pfn_type = NULL; + /* A table of MFNs to map in the current region */ + unsigned long *region_mfn = NULL; + /* A temporary mapping, and a copy, of one frame of guest memory. */ unsigned long *ppage; @@ -97,10 +100,12 @@ int xc_linux_restore(int xc_handle, /* A table mapping each PFN to its new MFN. */ unsigned long *pfn_to_mfn_table = NULL; + /* used by mapper for updating the domain's copy of the table */ + unsigned long *live_pfn_to_mfn_table = NULL; + /* A temporary mapping of the guest's suspend record. 
*/ suspend_record_t *p_srec; - mfn_mapper_t *region_mapper, *mapper_handle1; char *region_base; mmu_t *mmu = NULL; @@ -154,12 +159,20 @@ int xc_linux_restore(int xc_handle, /* We want zeroed memory so use calloc rather than malloc. */ pfn_to_mfn_table = calloc(1, 4 * nr_pfns); pfn_type = calloc(1, 4 * nr_pfns); + region_mfn = calloc(1, 4 * MAX_BATCH_SIZE); - if ( (pfn_to_mfn_table == NULL) || (pfn_type == NULL) ) + if ( (pfn_to_mfn_table == NULL) || (pfn_type == NULL) || + (region_mfn == NULL) ) { errno = ENOMEM; goto out; } + + if ( mlock(region_mfn, 4 * MAX_BATCH_SIZE ) ) + { + ERROR("Could not mlock region_mfn"); + goto out; + } /* Set the domain's name to that from the restore file */ if ( xc_domain_setname( xc_handle, dom, name ) ) @@ -206,15 +219,6 @@ int xc_linux_restore(int xc_handle, goto out; } - - if ( (region_mapper = mfn_mapper_init(xc_handle, dom, - MAX_BATCH_SIZE*PAGE_SIZE, - PROT_WRITE )) - == NULL ) - goto out; - - region_base = mfn_mapper_base( region_mapper ); - verbose_printf("Reloading memory pages: 0%%"); /* @@ -227,7 +231,7 @@ int xc_linux_restore(int xc_handle, while(1) { int j; - unsigned long region_pfn_type[1024]; + unsigned long region_pfn_type[MAX_BATCH_SIZE]; this_pc = (n * 100) / nr_pfns; if ( (this_pc - prev_pc) >= 5 ) @@ -270,30 +274,31 @@ int xc_linux_restore(int xc_handle, for(i=0;i<j;i++) { - if ((region_pfn_type[i]>>29) == 7) - continue; - - pfn = region_pfn_type[i] & ~PGT_type_mask; - mfn = pfn_to_mfn_table[pfn]; - - mfn_mapper_queue_entry( region_mapper, i<<PAGE_SHIFT, - mfn, PAGE_SIZE ); + if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) + region_mfn[i] = 0; // we know map will fail, but don't care + else + { + pfn = region_pfn_type[i] & ~LTAB_MASK; + region_mfn[i] = pfn_to_mfn_table[pfn]; + } } - - if( mfn_mapper_flush_queue(region_mapper) ) + + if ( (region_base = mfn_mapper_map_batch( xc_handle, dom, + PROT_WRITE, + region_mfn, + j )) == 0) { - ERROR("Couldn't map page region"); + PERROR("map batch failed"); goto out; } - 
for(i=0;i<j;i++) { unsigned long *ppage; - pfn = region_pfn_type[i] & ~PGT_type_mask; + pfn = region_pfn_type[i] & ~LTAB_MASK; - if ((region_pfn_type[i]>>29) == 7) + if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue; if (pfn>nr_pfns) @@ -302,7 +307,7 @@ int xc_linux_restore(int xc_handle, goto out; } - region_pfn_type[i] &= PGT_type_mask; + region_pfn_type[i] &= LTAB_MASK; pfn_type[pfn] = region_pfn_type[i]; @@ -334,7 +339,7 @@ int xc_linux_restore(int xc_handle, if ( xpfn >= nr_pfns ) { - ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns); + ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>28,i,k,xpfn,nr_pfns); goto out; } @@ -355,17 +360,11 @@ int xc_linux_restore(int xc_handle, if ( xpfn >= nr_pfns ) { - ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns); + ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>28,i,k,xpfn,nr_pfns); goto out; } -#if 0 - if ( region_pfn_type[pfn] != L1TAB ) - { - ERROR("Page table mistyping"); - goto out; - } -#endif + ppage[k] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE); ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT; } @@ -399,17 +398,21 @@ int xc_linux_restore(int xc_handle, if ( add_mmu_update(xc_handle, mmu, (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) ) + { + printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn); goto out; + } } // end of 'batch' for loop + munmap( region_base, j*PAGE_SIZE ); n+=j; // crude stats } - DPRINTF("Received all pages\n"); + printf("Received all pages\n"); - mfn_mapper_close( region_mapper ); + DPRINTF("Received all pages\n"); /* * Pin page tables. 
Do this after writing to them as otherwise Xen @@ -424,7 +427,8 @@ int xc_linux_restore(int xc_handle, MMU_EXTENDED_COMMAND, MMUEXT_PIN_L1_TABLE) ) { - printf("ERR pin L1 pfn=%lx mfn=%lx\n"); + printf("ERR pin L1 pfn=%lx mfn=%lx\n", + i, pfn_to_mfn_table[i]); goto out; } } @@ -435,7 +439,8 @@ int xc_linux_restore(int xc_handle, MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) ) { - printf("ERR pin L2 pfn=%lx mfn=%lx\n"); + printf("ERR pin L2 pfn=%lx mfn=%lx\n", + i, pfn_to_mfn_table[i]); goto out; } } @@ -456,7 +461,7 @@ int xc_linux_restore(int xc_handle, /* Uncanonicalise the suspend-record frame number and poke resume rec. */ pfn = ctxt.cpu_ctxt.esi; - if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) + if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) ) { ERROR("Suspend record frame number is bad"); goto out; @@ -477,7 +482,7 @@ int xc_linux_restore(int xc_handle, for ( i = 0; i < ctxt.gdt_ents; i += 512 ) { pfn = ctxt.gdt_frames[i]; - if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) + if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) ) { ERROR("GDT frame number is bad"); goto out; @@ -509,37 +514,33 @@ int xc_linux_restore(int xc_handle, /* Uncanonicalise the pfn-to-mfn table frame-number list. 
*/ - if ( (mapper_handle1 = mfn_mapper_init(xc_handle, dom, - 1024*1024, PROT_WRITE )) - == NULL ) - goto out; - for ( i = 0; i < (nr_pfns+1023)/1024; i++ ) { unsigned long pfn, mfn; pfn = pfn_to_mfn_frame_list[i]; - if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) + if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) ) { ERROR("PFN-to-MFN frame number is bad"); goto out; } mfn = pfn_to_mfn_table[pfn]; - - mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT, - mfn, PAGE_SIZE ); + pfn_to_mfn_frame_list[i] = mfn; } - if ( mfn_mapper_flush_queue(mapper_handle1) ) + if ( (live_pfn_to_mfn_table = mfn_mapper_map_batch( xc_handle, dom, + PROT_WRITE, + pfn_to_mfn_frame_list, + (nr_pfns+1023)/1024 )) == 0 ) { ERROR("Couldn't map pfn_to_mfn table"); goto out; } - memcpy( mfn_mapper_base( mapper_handle1 ), pfn_to_mfn_table, + memcpy( live_pfn_to_mfn_table, pfn_to_mfn_table, nr_pfns*sizeof(unsigned long) ); - mfn_mapper_close( mapper_handle1 ); + munmap( live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE ); /* * Safety checking of saved context: diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c index 2d31b3dae2..4913a8527f 100644 --- a/tools/xc/lib/xc_linux_save.c +++ b/tools/xc/lib/xc_linux_save.c @@ -64,36 +64,94 @@ /* test_bit */ -inline int test_bit ( int nr, volatile void * addr) +static inline int test_bit ( int nr, volatile void * addr) { return ( ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> (nr % (sizeof(unsigned long)*8) ) ) & 1; } -inline void clear_bit ( int nr, volatile void * addr) +static inline void clear_bit ( int nr, volatile void * addr) { ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &= ~(1 << (nr % (sizeof(unsigned long)*8) ) ); } -inline void set_bit ( int nr, volatile void * addr) +static inline void set_bit ( int nr, volatile void * addr) { ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |= (1 << (nr % (sizeof(unsigned long)*8) ) ); } +/* + * hweightN: returns the hamming weight (i.e. 
the number + * of bits set) of a N-bit word + */ + +static inline unsigned int hweight32(unsigned int w) +{ + unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555); + res = (res & 0x33333333) + ((res >> 2) & 0x33333333); + res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F); + res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF); + return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF); +} + +static inline int count_bits ( int nr, volatile void *addr) +{ + int i, count = 0; + unsigned long *p = (unsigned long *)addr; + // we know the array is padded to unsigned long + for(i=0;i<nr/(sizeof(unsigned long)*8);i++,p++) + count += hweight32( *p ); + return count; +} + +static inline int permute( int i, int nr, int order_nr ) +{ + /* Need a simple permutation function so that we scan pages in a + pseudo random order, enabling us to get a better estimate of + the domain's page dirtying rate as we go (there are often + contiguous ranges of pfns that have similar behaviour, and we + want to mix them up. */ + + /* e.g. 
nr->oder 15->4 16->4 17->5 */ + /* 512MB domain, 128k pages, order 17 */ + + /* + QPONMLKJIHGFEDCBA + QPONMLKJIH + GFEDCBA + */ + + /* + QPONMLKJIHGFEDCBA + EDCBA + QPONM + LKJIHGF + */ -long long tv_to_us( struct timeval *new ) + do + { + i = ( ( i>>(order_nr-10)) | ( i<<10 ) ) & + ((1<<order_nr)-1); + } + while ( i >= nr ); // this won't ever loop if nr is a power of 2 + + return i; +} + +static long long tv_to_us( struct timeval *new ) { return (new->tv_sec * 1000000) + new->tv_usec; } -long long tvdelta( struct timeval *new, struct timeval *old ) +static long long tvdelta( struct timeval *new, struct timeval *old ) { return ((new->tv_sec - old->tv_sec)*1000000 ) + (new->tv_usec - old->tv_usec); } -int track_cpu_usage( int xc_handle, u64 domid, int pages, int print ) +static int track_cpu_usage( int xc_handle, u64 domid, int faults, + int pages_sent, int pages_dirtied, int print ) { static struct timeval wall_last; static long long d0_cpu_last; @@ -123,11 +181,13 @@ int track_cpu_usage( int xc_handle, u64 domid, int pages, int print ) d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000; if(print) - printf("interval %lldms, dom0 used %lldms (%d%%), target used %lldms (%d%%), b/w %dMb/s\n", + printf("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, dirtied %dMb/s\n", wall_delta, - d0_cpu_delta, (int)((d0_cpu_delta*100)/wall_delta), - d1_cpu_delta, (int)((d1_cpu_delta*100)/wall_delta), - (int)((pages*PAGE_SIZE*8)/(wall_delta*1000))); + (int)((d0_cpu_delta*100)/wall_delta), + (int)((d1_cpu_delta*100)/wall_delta), + (int)((pages_sent*PAGE_SIZE*8)/(wall_delta*1000)), + (int)((pages_dirtied*PAGE_SIZE*8)/(wall_delta*1000)) + ); d0_cpu_last = d0_cpu_now; d1_cpu_last = d1_cpu_now; @@ -144,13 +204,14 @@ int xc_linux_save(int xc_handle, void *writerst ) { dom0_op_t op; - int rc = 1, i, j, k, n, last_iter, iter = 0; + int rc = 1, i, j, k, last_iter, iter = 0; unsigned long mfn; int verbose = flags & XCFLAGS_VERBOSE; int live = flags & XCFLAGS_LIVE; int debug = flags & 
XCFLAGS_DEBUG; int sent_last_iter, sent_this_iter, skip_this_iter; - + unsigned long dirtied_this_iter, faults_this_iter; + /* Important tuning parameters */ int max_iters = 29; // limit us to 30 times round loop int max_factor = 3; // never send more than 3x nr_pfns @@ -192,6 +253,9 @@ int xc_linux_save(int xc_handle, /* number of pages we're dealing with */ unsigned long nr_pfns; + /* power of 2 order of nr_pfns */ + int order_nr; + /* bitmap of pages: - that should be sent this iteration (unless later marked as skip); - to skip this iteration because already dirty; @@ -310,7 +374,7 @@ int xc_linux_save(int xc_handle, { if ( xc_shadow_control( xc_handle, domid, DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY, - NULL, 0 ) < 0 ) + NULL, 0, NULL, NULL ) < 0 ) { ERROR("Couldn't enable shadow mode"); goto out; @@ -361,6 +425,11 @@ int xc_linux_save(int xc_handle, } + /* calculate the power of 2 order of nr_pfns, e.g. + 15->4 16->4 17->5 */ + for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ ); + +printf("nr_pfns=%d order_nr=%d\n",nr_pfns, order_nr); /* We want zeroed memory so use calloc rather than malloc. */ pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long)); @@ -415,25 +484,26 @@ int xc_linux_save(int xc_handle, goto out; } - track_cpu_usage( xc_handle, domid, 0, 0); + track_cpu_usage( xc_handle, domid, 0, 0, 0, 0 ); /* Now write out each data page, canonicalising page tables as we go... */ while(1) { - unsigned int prev_pc, batch, sent_this_iter; + unsigned int prev_pc, sent_this_iter, N, batch; iter++; - sent_this_iter = 0; skip_this_iter = 0; prev_pc = 0; + N=0; + verbose_printf("Saving memory pages: iter %d 0%%", iter); - n=0; - while( n < nr_pfns ) + while( N < nr_pfns ) { - unsigned int this_pc = (n * 100) / nr_pfns; + unsigned int this_pc = (N * 100) / nr_pfns; + if ( (this_pc - prev_pc) >= 5 ) { verbose_printf("\b\b\b\b%3d%%", this_pc); @@ -444,9 +514,9 @@ int xc_linux_save(int xc_handle, but this is fast enough for the moment. 
*/ if ( !last_iter && - xc_shadow_control( xc_handle, domid, - DOM0_SHADOW_CONTROL_OP_PEEK, - to_skip, nr_pfns ) != nr_pfns ) + xc_shadow_control(xc_handle, domid, + DOM0_SHADOW_CONTROL_OP_PEEK, + to_skip, nr_pfns, NULL, NULL) != nr_pfns ) { ERROR("Error peeking shadow bitmap"); goto out; @@ -456,8 +526,9 @@ int xc_linux_save(int xc_handle, /* load pfn_type[] with the mfn of all the pages we're doing in this batch. */ - for( batch = 0; batch < BATCH_SIZE && n < nr_pfns ; n++ ) + for( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ ) { + int n = permute(N, nr_pfns, order_nr ); if(0 && debug) fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n", @@ -528,7 +599,7 @@ int xc_linux_save(int xc_handle, for( j = 0; j < batch; j++ ) { - if((pfn_type[j]>>29) == 7) + if( (pfn_type[j] & LTAB_MASK) == XTAB) { DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]); continue; @@ -537,16 +608,16 @@ int xc_linux_save(int xc_handle, if(0 && debug) fprintf(stderr,"%d pfn= %08lx mfn= %08lx [mfn]= %08lx sum= %08lx\n", iter, - (pfn_type[j] & PGT_type_mask) | pfn_batch[j], + (pfn_type[j] & LTAB_MASK) | pfn_batch[j], pfn_type[j], - live_mfn_to_pfn_table[pfn_type[j]&(~PGT_type_mask)], + live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)], csum_page(region_base + (PAGE_SIZE*j)) ); /* canonicalise mfn->pfn */ - pfn_type[j] = (pfn_type[j] & PGT_type_mask) | + pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j]; - //live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]; + //live_mfn_to_pfn_table[pfn_type[j]&~LTAB_MASK]; } @@ -568,20 +639,20 @@ int xc_linux_save(int xc_handle, { /* write out pages in batch */ - if((pfn_type[j]>>29) == 7) + if( (pfn_type[j] & LTAB_MASK) == XTAB) { DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]); continue; } - if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) || - ((pfn_type[j] & PGT_type_mask) == L2TAB) ) + if ( ((pfn_type[j] & LTAB_MASK) == L1TAB) || + ((pfn_type[j] & LTAB_MASK) == L2TAB) ) { memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE); for 
( k = 0; - k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ? + k < (((pfn_type[j] & LTAB_MASK) == L2TAB) ? (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); k++ ) { @@ -610,9 +681,9 @@ int xc_linux_save(int xc_handle, page[k] &= PAGE_SIZE - 1; page[k] |= pfn << PAGE_SHIFT; -#if DEBUG +#if 0 printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n", - pfn_type[j]>>29, + pfn_type[j]>>28, j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT); #endif @@ -646,13 +717,13 @@ int xc_linux_save(int xc_handle, total_sent += sent_this_iter; - verbose_printf("\b\b\b\b100%% (pages sent= %d, skipped= %d )\n", - sent_this_iter, skip_this_iter ); + verbose_printf("\r %d: sent %d, skipped %d, ", + iter, sent_this_iter, skip_this_iter ); - track_cpu_usage( xc_handle, domid, sent_this_iter, 1); - if ( last_iter ) { + track_cpu_usage( xc_handle, domid, 0, sent_this_iter, 0, 1); + verbose_printf("Total pages sent= %d (%.2fx)\n", total_sent, ((float)total_sent)/nr_pfns ); verbose_printf("(of which %d were fixups)\n", needed_to_fix ); @@ -683,7 +754,7 @@ int xc_linux_save(int xc_handle, if ( // ( sent_this_iter > (sent_last_iter * 0.95) ) || (iter >= max_iters) || - (sent_this_iter+skip_this_iter < 10) || + (sent_this_iter+skip_this_iter < 50) || (total_sent > nr_pfns*max_factor) ) { DPRINTF("Start last iteration\n"); @@ -695,7 +766,8 @@ int xc_linux_save(int xc_handle, if ( xc_shadow_control( xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN2, - to_send, nr_pfns ) != nr_pfns ) + to_send, nr_pfns, &faults_this_iter, + &dirtied_this_iter) != nr_pfns ) { ERROR("Error flushing shadow PT"); goto out; @@ -703,6 +775,10 @@ int xc_linux_save(int xc_handle, sent_last_iter = sent_this_iter; + //dirtied_this_iter = count_bits( nr_pfns, to_send ); + track_cpu_usage( xc_handle, domid, faults_this_iter, + sent_this_iter, dirtied_this_iter, 1); + } diff --git a/tools/xc/lib/xc_private.c b/tools/xc/lib/xc_private.c index 9fd36455fd..32cc908d40 100644 --- a/tools/xc/lib/xc_private.c +++ b/tools/xc/lib/xc_private.c @@ 
-97,178 +97,9 @@ void * mfn_mapper_map_single(int xc_handle, domid_t dom, return addr; } -mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot) -{ - mfn_mapper_t * t; - t = calloc( 1, sizeof(mfn_mapper_t)+ - mfn_mapper_queue_size*sizeof(privcmd_mmap_entry_t) ); - if (!t) return NULL; - t->xc_handle = xc_handle; - t->size = size; - t->prot = prot; - t->error = 0; - t->max_queue_size = mfn_mapper_queue_size; - t->addr = mmap( NULL, size, prot, MAP_SHARED, xc_handle, 0 ); - if (!t->addr) - { - free(t); - return NULL; - } - t->ioctl.num = 0; - t->ioctl.dom = dom; - t->ioctl.entry = (privcmd_mmap_entry_t *) &t[1]; - return t; -} - -void * mfn_mapper_base(mfn_mapper_t *t) -{ - return t->addr; -} - -void mfn_mapper_close(mfn_mapper_t *t) -{ - if(t->addr) munmap( t->addr, t->size ); - free(t); -} - -static int __mfn_mapper_flush_queue(mfn_mapper_t *t) -{ - int rc; - rc = ioctl( t->xc_handle, IOCTL_PRIVCMD_MMAP, &t->ioctl ); - t->ioctl.num = 0; - if(rc && !t->error) - t->error = rc; - return rc; -} - -int mfn_mapper_flush_queue(mfn_mapper_t *t) -{ - int rc; - - rc = __mfn_mapper_flush_queue(t); - - if ( t->error ) - { - rc = t->error; - } - - t->error = 0; - return rc; -} - -void * mfn_mapper_queue_entry(mfn_mapper_t *t, int offset, - unsigned long mfn, int size) -{ - privcmd_mmap_entry_t *entry, *prev; - int pages; - - offset &= PAGE_MASK; - pages =(size+PAGE_SIZE-1)>>PAGE_SHIFT; - entry = &t->ioctl.entry[t->ioctl.num]; - - if ( t->ioctl.num > 0 ) - { - prev = &t->ioctl.entry[t->ioctl.num-1]; - - if ( (prev->va+(prev->npages*PAGE_SIZE)) == - ((unsigned long)t->addr+offset) && - (prev->mfn+prev->npages) == mfn ) - { - prev->npages += pages; - return t->addr+offset; - } - } - - entry->va = (unsigned long)t->addr+offset; - entry->mfn = mfn; - entry->npages = pages; - t->ioctl.num++; - - if(t->ioctl.num == t->max_queue_size) - { - if ( __mfn_mapper_flush_queue(t) ) - return 0; - } - - return t->addr+offset; -} - - /*******************/ -#if 0 - 
-mfn_typer_t *mfn_typer_init(int xc_handle, domid_t dom, int num ) -{ - mfn_typer_t *t; - multicall_entry_t *m; - dom0_op_compact_getpageframeinfo_t *d; - - t = calloc(1, sizeof(mfn_typer_t) ); - m = calloc(num, sizeof(multicall_entry_t)); - d = calloc(num, sizeof(dom0_op_compact_getpageframeinfo_t)); - - if (!t || !m || !d) - { - if(t) free(t); - if(m) free(m); - if(d) free(d); - return NULL; - } - -printf("sizeof(m)=%d sizeof(d)=%d m=%p d=%p\n",sizeof(multicall_entry_t), sizeof(dom0_op_compact_getpageframeinfo_t),m,d); - - if ( (mlock(m, sizeof(multicall_entry_t)*num ) != 0) || - (mlock(d, sizeof(dom0_op_compact_getpageframeinfo_t)*num ) != 0) ) - { - PERROR("Could not lock memory for Xen hypercall"); - return NULL; - } - - t->xc_handle = xc_handle; - t->max = num; - t->nr_multicall_ents=0; - t->multicall_list=m; - t->gpf_list=d; - t->dom = dom; - - return t; -} - -void mfn_typer_queue_entry(mfn_typer_t *t, unsigned long mfn ) -{ - int i = t->nr_multicall_ents; - multicall_entry_t *m = &t->multicall_list[i]; - dom0_op_compact_getpageframeinfo_t *d = &t->gpf_list[i]; - - d->cmd = DOM0_GETPAGEFRAMEINFO; - d->interface_version = DOM0_INTERFACE_VERSION; - d->getpageframeinfo.pfn = mfn; - d->getpageframeinfo.domain = t->dom; - d->getpageframeinfo.type = 1000; //~0UL; - - m->op = __HYPERVISOR_dom0_op; - m->args[0] = (unsigned long)d; - - t->nr_multicall_ents++; -} - -int mfn_typer_flush_queue(mfn_typer_t *t) -{ - if (t->nr_multicall_ents == 0) return 0; - do_multicall_op(t->xc_handle, t->multicall_list, t->nr_multicall_ents); - t->nr_multicall_ents = 0; -} - -unsigned int mfn_typer_get_result(mfn_typer_t *t, int idx) -{ - return t->gpf_list[idx].getpageframeinfo.type; -} - -#endif - /* NB: arr must be mlock'ed */ - int get_pfn_type_batch(int xc_handle, u64 dom, int num, unsigned long *arr) { @@ -362,8 +193,10 @@ int add_mmu_update(int xc_handle, mmu_t *mmu, { mmu->updates[mmu->idx].ptr = ptr; mmu->updates[mmu->idx].val = val; + if ( ++mmu->idx == MAX_MMU_UPDATES ) 
return flush_mmu_updates(xc_handle, mmu); + return 0; } diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c index 8641259c63..d2a291e4c8 100644 --- a/tools/xc/py/Xc.c +++ b/tools/xc/py/Xc.c @@ -380,7 +380,7 @@ static PyObject *pyxc_linux_restore(PyObject *self, do { rc = read( (int) fd, ((char*)buf)+tot, count-tot ); if ( rc < 0 ) { perror("READ"); return rc; } - if ( rc == 0 ) { printf("read: need %d, tot=%d got zero\n"); return -1; } + if ( rc == 0 ) { printf("read: need %d, tot=%d got zero\n",count-tot,tot); return -1; } tot += rc; } while ( tot < count ); @@ -1296,7 +1296,7 @@ static PyObject *pyxc_shadow_control(PyObject *self, &dom, &op) ) return NULL; - if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0) < 0 ) + if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, NULL) < 0 ) return PyErr_SetFromErrno(xc_error); Py_INCREF(zero); diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index 586a3a1270..b4baf6c141 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -397,7 +397,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) { ret = 0; - op->u.getpageframeinfo.type = NONE; + op->u.getpageframeinfo.type = NOTAB; if ( (page->type_and_flags & PGT_count_mask) != 0 ) { @@ -645,11 +645,17 @@ long do_dom0_op(dom0_op_t *u_dom0_op) switch( page->type_and_flags & PGT_type_mask ) { case PGT_l1_page_table: + type = L1TAB; + break; case PGT_l2_page_table: + type = L2TAB; + break; case PGT_l3_page_table: + type = L3TAB; + break; case PGT_l4_page_table: - type = page->type_and_flags & PGT_type_mask; - + type = L4TAB; + break; } l_arr[j] |= type; put_page(page); @@ -657,7 +663,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) else { e2_err: - l_arr[j] |= PGT_type_mask; /* error */ + l_arr[j] |= XTAB; } } diff --git a/xen/common/shadow.c b/xen/common/shadow.c index f222419b25..d9e4d9aef8 100644 --- a/xen/common/shadow.c +++ b/xen/common/shadow.c @@ -33,12 +33,34 @@ hypercall lock anyhow (at least initially). FIXME: -1. 
Flush needs to avoid blowing away the L2 page that another CPU may be using! - -fix using cpu_raise_softirq - -have a flag to count in, (after switching to init's PTs) -spinlock, reload cr3_shadow, unlock +The shadow table flush command is dangerous on SMP systems as the +guest may be using the L2 on one CPU while the other is trying to +blow the table away. + +The current save restore code works around this by not calling FLUSH, +but by calling CLEAN2 which leaves all L2s in tact (this is probably +quicker anyhow). + +Even so, we have to be very careful. The flush code may need to cause +a TLB flush on another CPU. It needs to do this while holding the +shadow table lock. The trouble is, the guest may be in the shadow page +fault handler spinning waiting to grab the shadow lock. It may have +intterupts disabled, hence we can't use the normal flush_tlb_cpu +mechanism. + +For the moment, we have a grim hace whereby the spinlock in the shadow +fault handler is actually a try lock, in a loop with a helper for the +tlb flush code. + +A better soloution would be to take a new flush lock, then raise a +per-domain soft irq on the other CPU. The softirq will switch to +init's PTs, then do an atomic inc of a variable to count himself in, +then spin on a lock. Having noticed that the other guy has counted +in, flush the shadow table, then release him by dropping the lock. He +will then reload cr3 from mm.page_table on the way out of the softirq. + +In domian-softirq context we know that the guy holds no locks and has +interrupts enabled. 
Nothing can go wrong ;-) **/ @@ -364,6 +386,11 @@ static int shadow_mode_table_op( struct task_struct *p, rc = -EINVAL; goto out; } + + sc->fault_count = p->mm.shadow_fault_count; + sc->dirty_count = p->mm.shadow_dirty_count; + p->mm.shadow_fault_count = 0; + p->mm.shadow_dirty_count = 0; sc->pages = p->tot_pages; @@ -746,6 +773,8 @@ int shadow_fault( unsigned long va, long error_code ) perfc_incrc(shadow_fixup_count); + m->shadow_fault_count++; + check_pagetable( current, current->mm.pagetable, "post-sf" ); spin_unlock(&m->shadow_lock); diff --git a/xen/include/asm-i386/processor.h b/xen/include/asm-i386/processor.h index c2e36d6191..b1eab99d56 100644 --- a/xen/include/asm-i386/processor.h +++ b/xen/include/asm-i386/processor.h @@ -445,17 +445,27 @@ struct mm_struct { l1_pgentry_t *perdomain_pt; pagetable_t pagetable; + /* shadow mode status and controls */ unsigned int shadow_mode; /* flags to control shadow table operation */ pagetable_t shadow_table; spinlock_t shadow_lock; + unsigned int shadow_max_page_count; // currently unused + + /* shadow hashtable */ struct shadow_status *shadow_ht; struct shadow_status *shadow_ht_free; struct shadow_status *shadow_ht_extras; /* extra allocation units */ + unsigned int shadow_extras_count; + + /* shadow dirty bitmap */ unsigned long *shadow_dirty_bitmap; unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */ - unsigned int shadow_page_count; - unsigned int shadow_max_page_count; - unsigned int shadow_extras_count; + + /* shadow mode stats */ + unsigned int shadow_page_count; + unsigned int shadow_fault_count; + unsigned int shadow_dirty_count; + /* Current LDT details. 
*/ unsigned long ldt_base, ldt_ents, shadow_ldt_mapcnt; diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h index 2a17605bf2..6273878d16 100644 --- a/xen/include/hypervisor-ifs/dom0_ops.h +++ b/xen/include/hypervisor-ifs/dom0_ops.h @@ -150,6 +150,13 @@ typedef struct dom0_settime_st } dom0_settime_t; #define DOM0_GETPAGEFRAMEINFO 18 +#define NOTAB 0 /* normal page */ +#define L1TAB (1<<28) +#define L2TAB (2<<28) +#define L3TAB (3<<28) +#define L4TAB (4<<28) +#define XTAB (0xf<<28) /* invalid page */ +#define LTAB_MASK XTAB typedef struct dom0_getpageframeinfo_st { /* IN variables. */ @@ -157,8 +164,7 @@ typedef struct dom0_getpageframeinfo_st domid_t domain; /* To which domain does the frame belong? */ /* OUT variables. */ /* Is the page PINNED to a type? */ - enum { NONE, L1TAB=(1<<29), L2TAB=(2<<29), L3TAB=(3<<29), L4TAB=(4<<29) } type; -#define PGT_type_mask (7<<29) + unsigned long type; /* see above type defs */ } dom0_getpageframeinfo_t; @@ -251,6 +257,9 @@ typedef struct dom0_shadow_control_st unsigned long *dirty_bitmap; // pointe to mlocked buffer /* IN/OUT variables */ unsigned long pages; // size of buffer, updated with actual size + /* OUT varaibles */ + unsigned long fault_count; + unsigned long dirty_count; } dom0_shadow_control_t; #define DOM0_SETDOMAINNAME 26 diff --git a/xen/include/xen/shadow.h b/xen/include/xen/shadow.h index f1ce8b6689..1597e1feb2 100644 --- a/xen/include/xen/shadow.h +++ b/xen/include/xen/shadow.h @@ -95,8 +95,12 @@ printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \ ASSERT(m->shadow_dirty_bitmap); if( likely(pfn<m->shadow_dirty_bitmap_size) ) { - /* These updates occur with mm.shadow_lock held */ - __set_bit( pfn, m->shadow_dirty_bitmap ); + /* These updates occur with mm.shadow_lock held, so use + (__) version of test_and_set */ + if( ! 
__test_and_set_bit( pfn, m->shadow_dirty_bitmap ) ) + { + m->shadow_dirty_count++; + } } else { diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c index 5bb4f4f4b9..a4b01eb83f 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c @@ -172,7 +172,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, v); if ( unlikely(HYPERVISOR_mmu_update(u, v - u + 1, NULL) < 0) ) - put_user( 0xe0000000 | mfn, p ); + put_user( 0xF0000000 | mfn, p ); v = w; } |