aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authoriap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>2004-05-24 18:13:06 +0000
committeriap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>2004-05-24 18:13:06 +0000
commit84b3199c7847e50cb3d777127327674b21540801 (patch)
treea31f8c33c41b7ba3f66c16fa65b7f9895540c5a6
parentfdb0b778782b5a8606b37689bbfbe004fdac2722 (diff)
downloadxen-84b3199c7847e50cb3d777127327674b21540801.tar.gz
xen-84b3199c7847e50cb3d777127327674b21540801.tar.bz2
xen-84b3199c7847e50cb3d777127327674b21540801.zip
bitkeeper revision 1.921 (40b23b32vMbvKCdgtCukVpQP62ZvYA)
Add more stats to migration code, scan pages in pseudo-random permutation, misc cleanups.
-rw-r--r--tools/xc/lib/xc.h4
-rw-r--r--tools/xc/lib/xc_domain.c7
-rw-r--r--tools/xc/lib/xc_linux_restore.c109
-rw-r--r--tools/xc/lib/xc_linux_save.c156
-rw-r--r--tools/xc/lib/xc_private.c171
-rw-r--r--tools/xc/py/Xc.c4
-rw-r--r--xen/common/dom0_ops.c14
-rw-r--r--xen/common/shadow.c41
-rw-r--r--xen/include/asm-i386/processor.h16
-rw-r--r--xen/include/hypervisor-ifs/dom0_ops.h13
-rw-r--r--xen/include/xen/shadow.h8
-rw-r--r--xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c2
12 files changed, 260 insertions, 285 deletions
diff --git a/tools/xc/lib/xc.h b/tools/xc/lib/xc.h
index 223710fad1..3423c4ec46 100644
--- a/tools/xc/lib/xc.h
+++ b/tools/xc/lib/xc.h
@@ -59,7 +59,9 @@ int xc_shadow_control(int xc_handle,
u64 domid,
unsigned int sop,
unsigned long *dirty_bitmap,
- unsigned long pages);
+ unsigned long pages,
+ unsigned long *fault_count,
+ unsigned long *dirty_count);
#define XCFLAGS_VERBOSE 1
diff --git a/tools/xc/lib/xc_domain.c b/tools/xc/lib/xc_domain.c
index 97b2b26f6d..dd8b4ec35f 100644
--- a/tools/xc/lib/xc_domain.c
+++ b/tools/xc/lib/xc_domain.c
@@ -112,7 +112,9 @@ int xc_shadow_control(int xc_handle,
u64 domid,
unsigned int sop,
unsigned long *dirty_bitmap,
- unsigned long pages)
+ unsigned long pages,
+ unsigned long *fault_count,
+ unsigned long *dirty_count)
{
int rc;
dom0_op_t op;
@@ -124,6 +126,9 @@ int xc_shadow_control(int xc_handle,
rc = do_dom0_op(xc_handle, &op);
+ if(fault_count) *fault_count = op.u.shadow_control.fault_count;
+ if(dirty_count) *dirty_count = op.u.shadow_control.dirty_count;
+
if ( rc == 0 )
return op.u.shadow_control.pages;
else
diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c
index 4e89b5715f..e756ad6ffd 100644
--- a/tools/xc/lib/xc_linux_restore.c
+++ b/tools/xc/lib/xc_linux_restore.c
@@ -88,6 +88,9 @@ int xc_linux_restore(int xc_handle,
/* A table containg the type of each PFN (/not/ MFN!). */
unsigned long *pfn_type = NULL;
+ /* A table of MFNs to map in the current region */
+ unsigned long *region_mfn = NULL;
+
/* A temporary mapping, and a copy, of one frame of guest memory. */
unsigned long *ppage;
@@ -97,10 +100,12 @@ int xc_linux_restore(int xc_handle,
/* A table mapping each PFN to its new MFN. */
unsigned long *pfn_to_mfn_table = NULL;
+ /* used by mapper for updating the domain's copy of the table */
+ unsigned long *live_pfn_to_mfn_table = NULL;
+
/* A temporary mapping of the guest's suspend record. */
suspend_record_t *p_srec;
- mfn_mapper_t *region_mapper, *mapper_handle1;
char *region_base;
mmu_t *mmu = NULL;
@@ -154,12 +159,20 @@ int xc_linux_restore(int xc_handle,
/* We want zeroed memory so use calloc rather than malloc. */
pfn_to_mfn_table = calloc(1, 4 * nr_pfns);
pfn_type = calloc(1, 4 * nr_pfns);
+ region_mfn = calloc(1, 4 * MAX_BATCH_SIZE);
- if ( (pfn_to_mfn_table == NULL) || (pfn_type == NULL) )
+ if ( (pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
+ (region_mfn == NULL) )
{
errno = ENOMEM;
goto out;
}
+
+ if ( mlock(region_mfn, 4 * MAX_BATCH_SIZE ) )
+ {
+ ERROR("Could not mlock region_mfn");
+ goto out;
+ }
/* Set the domain's name to that from the restore file */
if ( xc_domain_setname( xc_handle, dom, name ) )
@@ -206,15 +219,6 @@ int xc_linux_restore(int xc_handle,
goto out;
}
-
- if ( (region_mapper = mfn_mapper_init(xc_handle, dom,
- MAX_BATCH_SIZE*PAGE_SIZE,
- PROT_WRITE ))
- == NULL )
- goto out;
-
- region_base = mfn_mapper_base( region_mapper );
-
verbose_printf("Reloading memory pages: 0%%");
/*
@@ -227,7 +231,7 @@ int xc_linux_restore(int xc_handle,
while(1)
{
int j;
- unsigned long region_pfn_type[1024];
+ unsigned long region_pfn_type[MAX_BATCH_SIZE];
this_pc = (n * 100) / nr_pfns;
if ( (this_pc - prev_pc) >= 5 )
@@ -270,30 +274,31 @@ int xc_linux_restore(int xc_handle,
for(i=0;i<j;i++)
{
- if ((region_pfn_type[i]>>29) == 7)
- continue;
-
- pfn = region_pfn_type[i] & ~PGT_type_mask;
- mfn = pfn_to_mfn_table[pfn];
-
- mfn_mapper_queue_entry( region_mapper, i<<PAGE_SHIFT,
- mfn, PAGE_SIZE );
+ if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
+ region_mfn[i] = 0; // we know map will fail, but don't care
+ else
+ {
+ pfn = region_pfn_type[i] & ~LTAB_MASK;
+ region_mfn[i] = pfn_to_mfn_table[pfn];
+ }
}
-
- if( mfn_mapper_flush_queue(region_mapper) )
+
+ if ( (region_base = mfn_mapper_map_batch( xc_handle, dom,
+ PROT_WRITE,
+ region_mfn,
+ j )) == 0)
{
- ERROR("Couldn't map page region");
+ PERROR("map batch failed");
goto out;
}
-
for(i=0;i<j;i++)
{
unsigned long *ppage;
- pfn = region_pfn_type[i] & ~PGT_type_mask;
+ pfn = region_pfn_type[i] & ~LTAB_MASK;
- if ((region_pfn_type[i]>>29) == 7)
+ if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
continue;
if (pfn>nr_pfns)
@@ -302,7 +307,7 @@ int xc_linux_restore(int xc_handle,
goto out;
}
- region_pfn_type[i] &= PGT_type_mask;
+ region_pfn_type[i] &= LTAB_MASK;
pfn_type[pfn] = region_pfn_type[i];
@@ -334,7 +339,7 @@ int xc_linux_restore(int xc_handle,
if ( xpfn >= nr_pfns )
{
- ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns);
+ ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>28,i,k,xpfn,nr_pfns);
goto out;
}
@@ -355,17 +360,11 @@ int xc_linux_restore(int xc_handle,
if ( xpfn >= nr_pfns )
{
- ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns);
+ ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>28,i,k,xpfn,nr_pfns);
goto out;
}
-#if 0
- if ( region_pfn_type[pfn] != L1TAB )
- {
- ERROR("Page table mistyping");
- goto out;
- }
-#endif
+
ppage[k] &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
}
@@ -399,17 +398,21 @@ int xc_linux_restore(int xc_handle,
if ( add_mmu_update(xc_handle, mmu,
(mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) )
+ {
+ printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
goto out;
+ }
} // end of 'batch' for loop
+ munmap( region_base, j*PAGE_SIZE );
n+=j; // crude stats
}
- DPRINTF("Received all pages\n");
+ printf("Received all pages\n");
- mfn_mapper_close( region_mapper );
+ DPRINTF("Received all pages\n");
/*
* Pin page tables. Do this after writing to them as otherwise Xen
@@ -424,7 +427,8 @@ int xc_linux_restore(int xc_handle,
MMU_EXTENDED_COMMAND,
MMUEXT_PIN_L1_TABLE) )
{
- printf("ERR pin L1 pfn=%lx mfn=%lx\n");
+ printf("ERR pin L1 pfn=%lx mfn=%lx\n",
+ i, pfn_to_mfn_table[i]);
goto out;
}
}
@@ -435,7 +439,8 @@ int xc_linux_restore(int xc_handle,
MMU_EXTENDED_COMMAND,
MMUEXT_PIN_L2_TABLE) )
{
- printf("ERR pin L2 pfn=%lx mfn=%lx\n");
+ printf("ERR pin L2 pfn=%lx mfn=%lx\n",
+ i, pfn_to_mfn_table[i]);
goto out;
}
}
@@ -456,7 +461,7 @@ int xc_linux_restore(int xc_handle,
/* Uncanonicalise the suspend-record frame number and poke resume rec. */
pfn = ctxt.cpu_ctxt.esi;
- if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) )
+ if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
{
ERROR("Suspend record frame number is bad");
goto out;
@@ -477,7 +482,7 @@ int xc_linux_restore(int xc_handle,
for ( i = 0; i < ctxt.gdt_ents; i += 512 )
{
pfn = ctxt.gdt_frames[i];
- if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) )
+ if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
{
ERROR("GDT frame number is bad");
goto out;
@@ -509,37 +514,33 @@ int xc_linux_restore(int xc_handle,
/* Uncanonicalise the pfn-to-mfn table frame-number list. */
- if ( (mapper_handle1 = mfn_mapper_init(xc_handle, dom,
- 1024*1024, PROT_WRITE ))
- == NULL )
- goto out;
-
for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
{
unsigned long pfn, mfn;
pfn = pfn_to_mfn_frame_list[i];
- if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) )
+ if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
{
ERROR("PFN-to-MFN frame number is bad");
goto out;
}
mfn = pfn_to_mfn_table[pfn];
-
- mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT,
- mfn, PAGE_SIZE );
+ pfn_to_mfn_frame_list[i] = mfn;
}
- if ( mfn_mapper_flush_queue(mapper_handle1) )
+ if ( (live_pfn_to_mfn_table = mfn_mapper_map_batch( xc_handle, dom,
+ PROT_WRITE,
+ pfn_to_mfn_frame_list,
+ (nr_pfns+1023)/1024 )) == 0 )
{
ERROR("Couldn't map pfn_to_mfn table");
goto out;
}
- memcpy( mfn_mapper_base( mapper_handle1 ), pfn_to_mfn_table,
+ memcpy( live_pfn_to_mfn_table, pfn_to_mfn_table,
nr_pfns*sizeof(unsigned long) );
- mfn_mapper_close( mapper_handle1 );
+ munmap( live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE );
/*
* Safety checking of saved context:
diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c
index 2d31b3dae2..4913a8527f 100644
--- a/tools/xc/lib/xc_linux_save.c
+++ b/tools/xc/lib/xc_linux_save.c
@@ -64,36 +64,94 @@
/* test_bit */
-inline int test_bit ( int nr, volatile void * addr)
+static inline int test_bit ( int nr, volatile void * addr)
{
return ( ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >>
(nr % (sizeof(unsigned long)*8) ) ) & 1;
}
-inline void clear_bit ( int nr, volatile void * addr)
+static inline void clear_bit ( int nr, volatile void * addr)
{
((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &=
~(1 << (nr % (sizeof(unsigned long)*8) ) );
}
-inline void set_bit ( int nr, volatile void * addr)
+static inline void set_bit ( int nr, volatile void * addr)
{
((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |=
(1 << (nr % (sizeof(unsigned long)*8) ) );
}
+/*
+ * hweightN: returns the hamming weight (i.e. the number
+ * of bits set) of a N-bit word
+ */
+
+static inline unsigned int hweight32(unsigned int w)
+{
+ unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+ res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+ res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+ res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+ return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+}
+
+static inline int count_bits ( int nr, volatile void *addr)
+{
+ int i, count = 0;
+ unsigned long *p = (unsigned long *)addr;
+ // we know the array is padded to unsigned long
+ for(i=0;i<nr/(sizeof(unsigned long)*8);i++,p++)
+ count += hweight32( *p );
+ return count;
+}
+
+static inline int permute( int i, int nr, int order_nr )
+{
+ /* Need a simple permutation function so that we scan pages in a
+ pseudo random order, enabling us to get a better estimate of
+ the domain's page dirtying rate as we go (there are often
+ contiguous ranges of pfns that have similar behaviour, and we
+ want to mix them up. */
+
+ /* e.g. nr->oder 15->4 16->4 17->5 */
+ /* 512MB domain, 128k pages, order 17 */
+
+ /*
+ QPONMLKJIHGFEDCBA
+ QPONMLKJIH
+ GFEDCBA
+ */
+
+ /*
+ QPONMLKJIHGFEDCBA
+ EDCBA
+ QPONM
+ LKJIHGF
+ */
-long long tv_to_us( struct timeval *new )
+ do
+ {
+ i = ( ( i>>(order_nr-10)) | ( i<<10 ) ) &
+ ((1<<order_nr)-1);
+ }
+ while ( i >= nr ); // this won't ever loop if nr is a power of 2
+
+ return i;
+}
+
+static long long tv_to_us( struct timeval *new )
{
return (new->tv_sec * 1000000) + new->tv_usec;
}
-long long tvdelta( struct timeval *new, struct timeval *old )
+static long long tvdelta( struct timeval *new, struct timeval *old )
{
return ((new->tv_sec - old->tv_sec)*1000000 ) +
(new->tv_usec - old->tv_usec);
}
-int track_cpu_usage( int xc_handle, u64 domid, int pages, int print )
+static int track_cpu_usage( int xc_handle, u64 domid, int faults,
+ int pages_sent, int pages_dirtied, int print )
{
static struct timeval wall_last;
static long long d0_cpu_last;
@@ -123,11 +181,13 @@ int track_cpu_usage( int xc_handle, u64 domid, int pages, int print )
d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
if(print)
- printf("interval %lldms, dom0 used %lldms (%d%%), target used %lldms (%d%%), b/w %dMb/s\n",
+ printf("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, dirtied %dMb/s\n",
wall_delta,
- d0_cpu_delta, (int)((d0_cpu_delta*100)/wall_delta),
- d1_cpu_delta, (int)((d1_cpu_delta*100)/wall_delta),
- (int)((pages*PAGE_SIZE*8)/(wall_delta*1000)));
+ (int)((d0_cpu_delta*100)/wall_delta),
+ (int)((d1_cpu_delta*100)/wall_delta),
+ (int)((pages_sent*PAGE_SIZE*8)/(wall_delta*1000)),
+ (int)((pages_dirtied*PAGE_SIZE*8)/(wall_delta*1000))
+ );
d0_cpu_last = d0_cpu_now;
d1_cpu_last = d1_cpu_now;
@@ -144,13 +204,14 @@ int xc_linux_save(int xc_handle,
void *writerst )
{
dom0_op_t op;
- int rc = 1, i, j, k, n, last_iter, iter = 0;
+ int rc = 1, i, j, k, last_iter, iter = 0;
unsigned long mfn;
int verbose = flags & XCFLAGS_VERBOSE;
int live = flags & XCFLAGS_LIVE;
int debug = flags & XCFLAGS_DEBUG;
int sent_last_iter, sent_this_iter, skip_this_iter;
-
+ unsigned long dirtied_this_iter, faults_this_iter;
+
/* Important tuning parameters */
int max_iters = 29; // limit us to 30 times round loop
int max_factor = 3; // never send more than 3x nr_pfns
@@ -192,6 +253,9 @@ int xc_linux_save(int xc_handle,
/* number of pages we're dealing with */
unsigned long nr_pfns;
+ /* power of 2 order of nr_pfns */
+ int order_nr;
+
/* bitmap of pages:
- that should be sent this iteration (unless later marked as skip);
- to skip this iteration because already dirty;
@@ -310,7 +374,7 @@ int xc_linux_save(int xc_handle,
{
if ( xc_shadow_control( xc_handle, domid,
DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
- NULL, 0 ) < 0 )
+ NULL, 0, NULL, NULL ) < 0 )
{
ERROR("Couldn't enable shadow mode");
goto out;
@@ -361,6 +425,11 @@ int xc_linux_save(int xc_handle,
}
+ /* calculate the power of 2 order of nr_pfns, e.g.
+ 15->4 16->4 17->5 */
+ for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
+
+printf("nr_pfns=%d order_nr=%d\n",nr_pfns, order_nr);
/* We want zeroed memory so use calloc rather than malloc. */
pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
@@ -415,25 +484,26 @@ int xc_linux_save(int xc_handle,
goto out;
}
- track_cpu_usage( xc_handle, domid, 0, 0);
+ track_cpu_usage( xc_handle, domid, 0, 0, 0, 0 );
/* Now write out each data page, canonicalising page tables as we go... */
while(1)
{
- unsigned int prev_pc, batch, sent_this_iter;
+ unsigned int prev_pc, sent_this_iter, N, batch;
iter++;
-
sent_this_iter = 0;
skip_this_iter = 0;
prev_pc = 0;
+ N=0;
+
verbose_printf("Saving memory pages: iter %d 0%%", iter);
- n=0;
- while( n < nr_pfns )
+ while( N < nr_pfns )
{
- unsigned int this_pc = (n * 100) / nr_pfns;
+ unsigned int this_pc = (N * 100) / nr_pfns;
+
if ( (this_pc - prev_pc) >= 5 )
{
verbose_printf("\b\b\b\b%3d%%", this_pc);
@@ -444,9 +514,9 @@ int xc_linux_save(int xc_handle,
but this is fast enough for the moment. */
if ( !last_iter &&
- xc_shadow_control( xc_handle, domid,
- DOM0_SHADOW_CONTROL_OP_PEEK,
- to_skip, nr_pfns ) != nr_pfns )
+ xc_shadow_control(xc_handle, domid,
+ DOM0_SHADOW_CONTROL_OP_PEEK,
+ to_skip, nr_pfns, NULL, NULL) != nr_pfns )
{
ERROR("Error peeking shadow bitmap");
goto out;
@@ -456,8 +526,9 @@ int xc_linux_save(int xc_handle,
/* load pfn_type[] with the mfn of all the pages we're doing in
this batch. */
- for( batch = 0; batch < BATCH_SIZE && n < nr_pfns ; n++ )
+ for( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
{
+ int n = permute(N, nr_pfns, order_nr );
if(0 && debug)
fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
@@ -528,7 +599,7 @@ int xc_linux_save(int xc_handle,
for( j = 0; j < batch; j++ )
{
- if((pfn_type[j]>>29) == 7)
+ if( (pfn_type[j] & LTAB_MASK) == XTAB)
{
DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
continue;
@@ -537,16 +608,16 @@ int xc_linux_save(int xc_handle,
if(0 && debug)
fprintf(stderr,"%d pfn= %08lx mfn= %08lx [mfn]= %08lx sum= %08lx\n",
iter,
- (pfn_type[j] & PGT_type_mask) | pfn_batch[j],
+ (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
pfn_type[j],
- live_mfn_to_pfn_table[pfn_type[j]&(~PGT_type_mask)],
+ live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
csum_page(region_base + (PAGE_SIZE*j))
);
/* canonicalise mfn->pfn */
- pfn_type[j] = (pfn_type[j] & PGT_type_mask) |
+ pfn_type[j] = (pfn_type[j] & LTAB_MASK) |
pfn_batch[j];
- //live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
+ //live_mfn_to_pfn_table[pfn_type[j]&~LTAB_MASK];
}
@@ -568,20 +639,20 @@ int xc_linux_save(int xc_handle,
{
/* write out pages in batch */
- if((pfn_type[j]>>29) == 7)
+ if( (pfn_type[j] & LTAB_MASK) == XTAB)
{
DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
continue;
}
- if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) ||
- ((pfn_type[j] & PGT_type_mask) == L2TAB) )
+ if ( ((pfn_type[j] & LTAB_MASK) == L1TAB) ||
+ ((pfn_type[j] & LTAB_MASK) == L2TAB) )
{
memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
for ( k = 0;
- k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ?
+ k < (((pfn_type[j] & LTAB_MASK) == L2TAB) ?
(HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024);
k++ )
{
@@ -610,9 +681,9 @@ int xc_linux_save(int xc_handle,
page[k] &= PAGE_SIZE - 1;
page[k] |= pfn << PAGE_SHIFT;
-#if DEBUG
+#if 0
printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n",
- pfn_type[j]>>29,
+ pfn_type[j]>>28,
j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
#endif
@@ -646,13 +717,13 @@ int xc_linux_save(int xc_handle,
total_sent += sent_this_iter;
- verbose_printf("\b\b\b\b100%% (pages sent= %d, skipped= %d )\n",
- sent_this_iter, skip_this_iter );
+ verbose_printf("\r %d: sent %d, skipped %d, ",
+ iter, sent_this_iter, skip_this_iter );
- track_cpu_usage( xc_handle, domid, sent_this_iter, 1);
-
if ( last_iter )
{
+ track_cpu_usage( xc_handle, domid, 0, sent_this_iter, 0, 1);
+
verbose_printf("Total pages sent= %d (%.2fx)\n",
total_sent, ((float)total_sent)/nr_pfns );
verbose_printf("(of which %d were fixups)\n", needed_to_fix );
@@ -683,7 +754,7 @@ int xc_linux_save(int xc_handle,
if (
// ( sent_this_iter > (sent_last_iter * 0.95) ) ||
(iter >= max_iters) ||
- (sent_this_iter+skip_this_iter < 10) ||
+ (sent_this_iter+skip_this_iter < 50) ||
(total_sent > nr_pfns*max_factor) )
{
DPRINTF("Start last iteration\n");
@@ -695,7 +766,8 @@ int xc_linux_save(int xc_handle,
if ( xc_shadow_control( xc_handle, domid,
DOM0_SHADOW_CONTROL_OP_CLEAN2,
- to_send, nr_pfns ) != nr_pfns )
+ to_send, nr_pfns, &faults_this_iter,
+ &dirtied_this_iter) != nr_pfns )
{
ERROR("Error flushing shadow PT");
goto out;
@@ -703,6 +775,10 @@ int xc_linux_save(int xc_handle,
sent_last_iter = sent_this_iter;
+ //dirtied_this_iter = count_bits( nr_pfns, to_send );
+ track_cpu_usage( xc_handle, domid, faults_this_iter,
+ sent_this_iter, dirtied_this_iter, 1);
+
}
diff --git a/tools/xc/lib/xc_private.c b/tools/xc/lib/xc_private.c
index 9fd36455fd..32cc908d40 100644
--- a/tools/xc/lib/xc_private.c
+++ b/tools/xc/lib/xc_private.c
@@ -97,178 +97,9 @@ void * mfn_mapper_map_single(int xc_handle, domid_t dom,
return addr;
}
-mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot)
-{
- mfn_mapper_t * t;
- t = calloc( 1, sizeof(mfn_mapper_t)+
- mfn_mapper_queue_size*sizeof(privcmd_mmap_entry_t) );
- if (!t) return NULL;
- t->xc_handle = xc_handle;
- t->size = size;
- t->prot = prot;
- t->error = 0;
- t->max_queue_size = mfn_mapper_queue_size;
- t->addr = mmap( NULL, size, prot, MAP_SHARED, xc_handle, 0 );
- if (!t->addr)
- {
- free(t);
- return NULL;
- }
- t->ioctl.num = 0;
- t->ioctl.dom = dom;
- t->ioctl.entry = (privcmd_mmap_entry_t *) &t[1];
- return t;
-}
-
-void * mfn_mapper_base(mfn_mapper_t *t)
-{
- return t->addr;
-}
-
-void mfn_mapper_close(mfn_mapper_t *t)
-{
- if(t->addr) munmap( t->addr, t->size );
- free(t);
-}
-
-static int __mfn_mapper_flush_queue(mfn_mapper_t *t)
-{
- int rc;
- rc = ioctl( t->xc_handle, IOCTL_PRIVCMD_MMAP, &t->ioctl );
- t->ioctl.num = 0;
- if(rc && !t->error)
- t->error = rc;
- return rc;
-}
-
-int mfn_mapper_flush_queue(mfn_mapper_t *t)
-{
- int rc;
-
- rc = __mfn_mapper_flush_queue(t);
-
- if ( t->error )
- {
- rc = t->error;
- }
-
- t->error = 0;
- return rc;
-}
-
-void * mfn_mapper_queue_entry(mfn_mapper_t *t, int offset,
- unsigned long mfn, int size)
-{
- privcmd_mmap_entry_t *entry, *prev;
- int pages;
-
- offset &= PAGE_MASK;
- pages =(size+PAGE_SIZE-1)>>PAGE_SHIFT;
- entry = &t->ioctl.entry[t->ioctl.num];
-
- if ( t->ioctl.num > 0 )
- {
- prev = &t->ioctl.entry[t->ioctl.num-1];
-
- if ( (prev->va+(prev->npages*PAGE_SIZE)) ==
- ((unsigned long)t->addr+offset) &&
- (prev->mfn+prev->npages) == mfn )
- {
- prev->npages += pages;
- return t->addr+offset;
- }
- }
-
- entry->va = (unsigned long)t->addr+offset;
- entry->mfn = mfn;
- entry->npages = pages;
- t->ioctl.num++;
-
- if(t->ioctl.num == t->max_queue_size)
- {
- if ( __mfn_mapper_flush_queue(t) )
- return 0;
- }
-
- return t->addr+offset;
-}
-
-
/*******************/
-#if 0
-
-mfn_typer_t *mfn_typer_init(int xc_handle, domid_t dom, int num )
-{
- mfn_typer_t *t;
- multicall_entry_t *m;
- dom0_op_compact_getpageframeinfo_t *d;
-
- t = calloc(1, sizeof(mfn_typer_t) );
- m = calloc(num, sizeof(multicall_entry_t));
- d = calloc(num, sizeof(dom0_op_compact_getpageframeinfo_t));
-
- if (!t || !m || !d)
- {
- if(t) free(t);
- if(m) free(m);
- if(d) free(d);
- return NULL;
- }
-
-printf("sizeof(m)=%d sizeof(d)=%d m=%p d=%p\n",sizeof(multicall_entry_t), sizeof(dom0_op_compact_getpageframeinfo_t),m,d);
-
- if ( (mlock(m, sizeof(multicall_entry_t)*num ) != 0) ||
- (mlock(d, sizeof(dom0_op_compact_getpageframeinfo_t)*num ) != 0) )
- {
- PERROR("Could not lock memory for Xen hypercall");
- return NULL;
- }
-
- t->xc_handle = xc_handle;
- t->max = num;
- t->nr_multicall_ents=0;
- t->multicall_list=m;
- t->gpf_list=d;
- t->dom = dom;
-
- return t;
-}
-
-void mfn_typer_queue_entry(mfn_typer_t *t, unsigned long mfn )
-{
- int i = t->nr_multicall_ents;
- multicall_entry_t *m = &t->multicall_list[i];
- dom0_op_compact_getpageframeinfo_t *d = &t->gpf_list[i];
-
- d->cmd = DOM0_GETPAGEFRAMEINFO;
- d->interface_version = DOM0_INTERFACE_VERSION;
- d->getpageframeinfo.pfn = mfn;
- d->getpageframeinfo.domain = t->dom;
- d->getpageframeinfo.type = 1000; //~0UL;
-
- m->op = __HYPERVISOR_dom0_op;
- m->args[0] = (unsigned long)d;
-
- t->nr_multicall_ents++;
-}
-
-int mfn_typer_flush_queue(mfn_typer_t *t)
-{
- if (t->nr_multicall_ents == 0) return 0;
- do_multicall_op(t->xc_handle, t->multicall_list, t->nr_multicall_ents);
- t->nr_multicall_ents = 0;
-}
-
-unsigned int mfn_typer_get_result(mfn_typer_t *t, int idx)
-{
- return t->gpf_list[idx].getpageframeinfo.type;
-}
-
-#endif
-
/* NB: arr must be mlock'ed */
-
int get_pfn_type_batch(int xc_handle,
u64 dom, int num, unsigned long *arr)
{
@@ -362,8 +193,10 @@ int add_mmu_update(int xc_handle, mmu_t *mmu,
{
mmu->updates[mmu->idx].ptr = ptr;
mmu->updates[mmu->idx].val = val;
+
if ( ++mmu->idx == MAX_MMU_UPDATES )
return flush_mmu_updates(xc_handle, mmu);
+
return 0;
}
diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c
index 8641259c63..d2a291e4c8 100644
--- a/tools/xc/py/Xc.c
+++ b/tools/xc/py/Xc.c
@@ -380,7 +380,7 @@ static PyObject *pyxc_linux_restore(PyObject *self,
do {
rc = read( (int) fd, ((char*)buf)+tot, count-tot );
if ( rc < 0 ) { perror("READ"); return rc; }
- if ( rc == 0 ) { printf("read: need %d, tot=%d got zero\n"); return -1; }
+ if ( rc == 0 ) { printf("read: need %d, tot=%d got zero\n",count-tot,tot); return -1; }
tot += rc;
}
while ( tot < count );
@@ -1296,7 +1296,7 @@ static PyObject *pyxc_shadow_control(PyObject *self,
&dom, &op) )
return NULL;
- if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0) < 0 )
+ if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, NULL) < 0 )
return PyErr_SetFromErrno(xc_error);
Py_INCREF(zero);
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index 586a3a1270..b4baf6c141 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -397,7 +397,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
{
ret = 0;
- op->u.getpageframeinfo.type = NONE;
+ op->u.getpageframeinfo.type = NOTAB;
if ( (page->type_and_flags & PGT_count_mask) != 0 )
{
@@ -645,11 +645,17 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
switch( page->type_and_flags & PGT_type_mask )
{
case PGT_l1_page_table:
+ type = L1TAB;
+ break;
case PGT_l2_page_table:
+ type = L2TAB;
+ break;
case PGT_l3_page_table:
+ type = L3TAB;
+ break;
case PGT_l4_page_table:
- type = page->type_and_flags & PGT_type_mask;
-
+ type = L4TAB;
+ break;
}
l_arr[j] |= type;
put_page(page);
@@ -657,7 +663,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
else
{
e2_err:
- l_arr[j] |= PGT_type_mask; /* error */
+ l_arr[j] |= XTAB;
}
}
diff --git a/xen/common/shadow.c b/xen/common/shadow.c
index f222419b25..d9e4d9aef8 100644
--- a/xen/common/shadow.c
+++ b/xen/common/shadow.c
@@ -33,12 +33,34 @@ hypercall lock anyhow (at least initially).
FIXME:
-1. Flush needs to avoid blowing away the L2 page that another CPU may be using!
-
-fix using cpu_raise_softirq
-
-have a flag to count in, (after switching to init's PTs)
-spinlock, reload cr3_shadow, unlock
+The shadow table flush command is dangerous on SMP systems as the
+guest may be using the L2 on one CPU while the other is trying to
+blow the table away.
+
+The current save restore code works around this by not calling FLUSH,
+but by calling CLEAN2 which leaves all L2s in tact (this is probably
+quicker anyhow).
+
+Even so, we have to be very careful. The flush code may need to cause
+a TLB flush on another CPU. It needs to do this while holding the
+shadow table lock. The trouble is, the guest may be in the shadow page
+fault handler spinning waiting to grab the shadow lock. It may have
+intterupts disabled, hence we can't use the normal flush_tlb_cpu
+mechanism.
+
+For the moment, we have a grim hace whereby the spinlock in the shadow
+fault handler is actually a try lock, in a loop with a helper for the
+tlb flush code.
+
+A better soloution would be to take a new flush lock, then raise a
+per-domain soft irq on the other CPU. The softirq will switch to
+init's PTs, then do an atomic inc of a variable to count himself in,
+then spin on a lock. Having noticed that the other guy has counted
+in, flush the shadow table, then release him by dropping the lock. He
+will then reload cr3 from mm.page_table on the way out of the softirq.
+
+In domian-softirq context we know that the guy holds no locks and has
+interrupts enabled. Nothing can go wrong ;-)
**/
@@ -364,6 +386,11 @@ static int shadow_mode_table_op( struct task_struct *p,
rc = -EINVAL;
goto out;
}
+
+ sc->fault_count = p->mm.shadow_fault_count;
+ sc->dirty_count = p->mm.shadow_dirty_count;
+ p->mm.shadow_fault_count = 0;
+ p->mm.shadow_dirty_count = 0;
sc->pages = p->tot_pages;
@@ -746,6 +773,8 @@ int shadow_fault( unsigned long va, long error_code )
perfc_incrc(shadow_fixup_count);
+ m->shadow_fault_count++;
+
check_pagetable( current, current->mm.pagetable, "post-sf" );
spin_unlock(&m->shadow_lock);
diff --git a/xen/include/asm-i386/processor.h b/xen/include/asm-i386/processor.h
index c2e36d6191..b1eab99d56 100644
--- a/xen/include/asm-i386/processor.h
+++ b/xen/include/asm-i386/processor.h
@@ -445,17 +445,27 @@ struct mm_struct {
l1_pgentry_t *perdomain_pt;
pagetable_t pagetable;
+ /* shadow mode status and controls */
unsigned int shadow_mode; /* flags to control shadow table operation */
pagetable_t shadow_table;
spinlock_t shadow_lock;
+ unsigned int shadow_max_page_count; // currently unused
+
+ /* shadow hashtable */
struct shadow_status *shadow_ht;
struct shadow_status *shadow_ht_free;
struct shadow_status *shadow_ht_extras; /* extra allocation units */
+ unsigned int shadow_extras_count;
+
+ /* shadow dirty bitmap */
unsigned long *shadow_dirty_bitmap;
unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */
- unsigned int shadow_page_count;
- unsigned int shadow_max_page_count;
- unsigned int shadow_extras_count;
+
+ /* shadow mode stats */
+ unsigned int shadow_page_count;
+ unsigned int shadow_fault_count;
+ unsigned int shadow_dirty_count;
+
/* Current LDT details. */
unsigned long ldt_base, ldt_ents, shadow_ldt_mapcnt;
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h
index 2a17605bf2..6273878d16 100644
--- a/xen/include/hypervisor-ifs/dom0_ops.h
+++ b/xen/include/hypervisor-ifs/dom0_ops.h
@@ -150,6 +150,13 @@ typedef struct dom0_settime_st
} dom0_settime_t;
#define DOM0_GETPAGEFRAMEINFO 18
+#define NOTAB 0 /* normal page */
+#define L1TAB (1<<28)
+#define L2TAB (2<<28)
+#define L3TAB (3<<28)
+#define L4TAB (4<<28)
+#define XTAB (0xf<<28) /* invalid page */
+#define LTAB_MASK XTAB
typedef struct dom0_getpageframeinfo_st
{
/* IN variables. */
@@ -157,8 +164,7 @@ typedef struct dom0_getpageframeinfo_st
domid_t domain; /* To which domain does the frame belong? */
/* OUT variables. */
/* Is the page PINNED to a type? */
- enum { NONE, L1TAB=(1<<29), L2TAB=(2<<29), L3TAB=(3<<29), L4TAB=(4<<29) } type;
-#define PGT_type_mask (7<<29)
+ unsigned long type; /* see above type defs */
} dom0_getpageframeinfo_t;
@@ -251,6 +257,9 @@ typedef struct dom0_shadow_control_st
unsigned long *dirty_bitmap; // pointe to mlocked buffer
/* IN/OUT variables */
unsigned long pages; // size of buffer, updated with actual size
+ /* OUT varaibles */
+ unsigned long fault_count;
+ unsigned long dirty_count;
} dom0_shadow_control_t;
#define DOM0_SETDOMAINNAME 26
diff --git a/xen/include/xen/shadow.h b/xen/include/xen/shadow.h
index f1ce8b6689..1597e1feb2 100644
--- a/xen/include/xen/shadow.h
+++ b/xen/include/xen/shadow.h
@@ -95,8 +95,12 @@ printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
ASSERT(m->shadow_dirty_bitmap);
if( likely(pfn<m->shadow_dirty_bitmap_size) )
{
- /* These updates occur with mm.shadow_lock held */
- __set_bit( pfn, m->shadow_dirty_bitmap );
+ /* These updates occur with mm.shadow_lock held, so use
+ (__) version of test_and_set */
+ if( ! __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) )
+ {
+ m->shadow_dirty_count++;
+ }
}
else
{
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
index 5bb4f4f4b9..a4b01eb83f 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
@@ -172,7 +172,7 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
v);
if ( unlikely(HYPERVISOR_mmu_update(u, v - u + 1, NULL) < 0) )
- put_user( 0xe0000000 | mfn, p );
+ put_user( 0xF0000000 | mfn, p );
v = w;
}