diff options
author | iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk> | 2004-05-11 14:57:44 +0000 |
---|---|---|
committer | iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk> | 2004-05-11 14:57:44 +0000 |
commit | 07588dc73f0be59b52b968f7b88e43798bbe5e51 (patch) | |
tree | 1efd062adfd783ffd2d9e6a9bbcc7cdc858c9cdf | |
parent | a8e33843ee074cd479e0cb69a7e379587cdca5bb (diff) | |
download | xen-07588dc73f0be59b52b968f7b88e43798bbe5e51.tar.gz xen-07588dc73f0be59b52b968f7b88e43798bbe5e51.tar.bz2 xen-07588dc73f0be59b52b968f7b88e43798bbe5e51.zip |
bitkeeper revision 1.896 (40a0e9e8M0uaTwE5LBe9sIhr2vdX7Q)
Live migration initial checkin.
32 files changed, 871 insertions, 346 deletions
diff --git a/tools/examples/xc_dom_control.py b/tools/examples/xc_dom_control.py index 4f0bd5de52..d6cae4f720 100755 --- a/tools/examples/xc_dom_control.py +++ b/tools/examples/xc_dom_control.py @@ -139,10 +139,12 @@ elif cmd == 'suspend': xc.domain_stop( dom=dom ) while not xc.domain_getinfo( first_dom=dom, max_doms=1 )[0]['stopped']: - time.sleep(0.1); + print "Sleep..." + time.sleep(0.001); rc = xc.linux_save( dom=dom, state_file=file, progress=1) if rc == 0 : xc.domain_destroy( dom=dom, force=1 ) + else: xc.domain_start( dom=dom ) # sensible for production use elif cmd == 'cpu_bvtslice': if len(sys.argv) < 3: diff --git a/tools/xc/lib/xc.h b/tools/xc/lib/xc.h index a0205bcc6b..2132d6e7c1 100644 --- a/tools/xc/lib/xc.h +++ b/tools/xc/lib/xc.h @@ -57,7 +57,10 @@ int xc_domain_getinfo(int xc_handle, int xc_shadow_control(int xc_handle, u64 domid, - unsigned int sop); + unsigned int sop, + unsigned long *dirty_bitmap, + unsigned long pages); + #define XCFLAGS_VERBOSE 1 #define XCFLAGS_LIVE 2 @@ -247,11 +250,6 @@ int xc_readconsolering(int xc_handle, int xc_physinfo(int xc_handle, xc_physinfo_t *info); - -int xc_shadow_control(int xc_handle, - u64 domid, - unsigned int sop); - int xc_domain_setname(int xc_handle, u64 domid, char *name); diff --git a/tools/xc/lib/xc_domain.c b/tools/xc/lib/xc_domain.c index c26a3f87c3..6d0dd6d0f3 100644 --- a/tools/xc/lib/xc_domain.c +++ b/tools/xc/lib/xc_domain.c @@ -109,13 +109,24 @@ int xc_domain_getinfo(int xc_handle, int xc_shadow_control(int xc_handle, u64 domid, - unsigned int sop) + unsigned int sop, + unsigned long *dirty_bitmap, + unsigned long pages) { + int rc; dom0_op_t op; op.cmd = DOM0_SHADOW_CONTROL; op.u.shadow_control.domain = (domid_t)domid; op.u.shadow_control.op = sop; - return do_dom0_op(xc_handle, &op); + op.u.shadow_control.dirty_bitmap = dirty_bitmap; + op.u.shadow_control.pages = pages; + + rc = do_dom0_op(xc_handle, &op); + + if ( rc == 0 ) + return op.u.shadow_control.pages; + else + return rc; } int xc_domain_setname(int xc_handle, diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index 83debd904d..98a3fb6a60 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -284,7 +284,7 @@ static int setup_guestos(int xc_handle, /* shared_info page starts its life empty. */ shared_info = map_pfn_writeable(pm_handle, shared_info_frame); - memset(shared_info, 0, PAGE_SIZE); + memset(shared_info, 0, sizeof(shared_info_t)); /* Mask all upcalls... */ for ( i = 0; i < MAX_VIRT_CPUS; i++ ) shared_info->vcpu_data[i].evtchn_upcall_mask = 1; diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c index 861d38a5f7..9b1532159f 100644 --- a/tools/xc/lib/xc_linux_restore.c +++ b/tools/xc/lib/xc_linux_restore.c @@ -230,9 +230,16 @@ int xc_linux_restore(int xc_handle, goto out; } - //printf("batch=%d\n",j); + printf("batch %d\n",j); - if(j==0) break; // our work here is done + if (j == 0) + break; // our work here is done + + if( j > MAX_BATCH_SIZE ) + { + ERROR("Max batch size exceeded. Giving up."); + goto out; + } if ( (*readerfn)(readerst, region_pfn_type, j*sizeof(unsigned long)) ) { @@ -242,6 +249,9 @@ int xc_linux_restore(int xc_handle, for(i=0;i<j;i++) { + if ((region_pfn_type[i]>>29) == 7) + continue; + pfn = region_pfn_type[i] & ~PGT_type_mask; mfn = pfn_to_mfn_table[pfn]; @@ -261,9 +271,15 @@ int xc_linux_restore(int xc_handle, unsigned long *ppage; pfn = region_pfn_type[i] & ~PGT_type_mask; + +//if(n>=nr_pfns || ((region_pfn_type[i] & PGT_type_mask) == L2TAB) ) printf("pfn=%08lx mfn=%x\n",region_pfn_type[i],pfn_to_mfn_table[pfn]); + //if(pfn_type[i])printf("^pfn=%d %08lx\n",pfn,pfn_type[i]); + if ((region_pfn_type[i]>>29) == 7) + continue; + if (pfn>nr_pfns) { ERROR("pfn out of range"); @@ -304,7 +320,7 @@ int xc_linux_restore(int xc_handle, if ( xpfn >= nr_pfns ) { - ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns); + ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns); goto out; } #if 0 @@ -355,17 +371,19 @@ int xc_linux_restore(int xc_handle, default: ERROR("Bogus page type %x page table is out of range. i=%d nr_pfns=%d",region_pfn_type[i],i,nr_pfns); goto out; - } + + } // end of page type switch statement if ( add_mmu_update(xc_handle, mmu, (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) ) goto out; - } + } // end of 'batch' for loop n+=j; // crude stats } +printf("RECEIVED ALL PAGES\n"); mfn_mapper_close( region_mapper ); @@ -381,7 +399,10 @@ int xc_linux_restore(int xc_handle, (pfn_to_mfn_table[i]<<PAGE_SHIFT) | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L1_TABLE) ) + { + printf("ERR pin L1 pfn=%lx mfn=%lx\n"); goto out; + } } else if ( pfn_type[i] == L2TAB ) { @@ -389,7 +410,10 @@ int xc_linux_restore(int xc_handle, (pfn_to_mfn_table[i]<<PAGE_SHIFT) | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) ) + { + printf("ERR pin L2 pfn=%lx mfn=%lx\n"); goto out; + } } } @@ -421,6 +445,8 @@ int xc_linux_restore(int xc_handle, p_srec->resume_info.flags = 0; unmap_pfn(pm_handle, p_srec); +printf("new shared info is %lx\n", shared_info_frame); + /* Uncanonicalise each GDT frame number. */ if ( ctxt.gdt_ents > 8192 ) { @@ -451,7 +477,7 @@ int xc_linux_restore(int xc_handle, /* Copy saved contents of shared-info page. No checking needed. */ ppage = map_pfn_writeable(pm_handle, shared_info_frame); - memcpy(ppage, shared_info, PAGE_SIZE); + memcpy(ppage, shared_info, sizeof(shared_info_t)); unmap_pfn(pm_handle, ppage); @@ -528,7 +554,9 @@ int xc_linux_restore(int xc_handle, op.u.builddomain.ctxt = &ctxt; rc = do_dom0_op(xc_handle, &op); +printf("NORMAL EXIT RESTORE\n"); out: +printf("EXIT RESTORE\n"); if ( mmu != NULL ) free(mmu); diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c index 02e3ffc352..cbb1d66fd1 100644 --- a/tools/xc/lib/xc_linux_save.c +++ b/tools/xc/lib/xc_linux_save.c @@ -22,11 +22,17 @@ /* * Returns TRUE if the given machine frame number has a unique mapping * in the guest's pseudophysical map. + * 0x80000000-3 mark the shared_info, and blk/net rings */ #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ - (((_mfn) < (1024*1024)) && \ - (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))) - + (((_mfn) < (1024*1024)) && \ + ( ( (live_mfn_to_pfn_table[_mfn] < nr_pfns) && \ + (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn)) ) || \ +\ + (live_mfn_to_pfn_table[_mfn] >= 0x80000000 && \ + live_mfn_to_pfn_table[_mfn] <= 0x80000003 ) || \ + live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == 0x80000004 ) ) + /* Returns TRUE if MFN is successfully converted to a PFN. */ #define translate_mfn_to_pfn(_pmfn) \ ({ \ @@ -40,6 +46,14 @@ }) +/* test_bit */ +inline int test_bit ( int nr, volatile void * addr) +{ + return ( ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> + (nr % (sizeof(unsigned long)*8) ) ) & 1; +} + + int xc_linux_save(int xc_handle, u64 domid, unsigned int flags, @@ -47,14 +61,11 @@ int xc_linux_save(int xc_handle, void *writerst ) { dom0_op_t op; - int rc = 1, i, j, k, n; + int rc = 1, i, j, k, n, last_iter, iter = 0; unsigned long mfn; - unsigned int prev_pc, this_pc; int verbose = flags & XCFLAGS_VERBOSE; - //int live = flags & XCFLAGS_LIVE; - - /* state of the new MFN mapper */ - mfn_mapper_t *mapper_handle1, *mapper_handle2; + int live = 1; //flags & XCFLAGS_LIVE; // XXXXXXXXXXXXXXXXXXX + int sent_last_iter, sent_this_iter, max_iters; /* Remember if we stopped the guest, so we can restart it on exit. */ int we_stopped_it = 0; @@ -90,8 +101,13 @@ int xc_linux_save(int xc_handle, unsigned char *region_base; /* A temporary mapping, and a copy, of the guest's suspend record. */ - suspend_record_t *p_srec, srec; + suspend_record_t *p_srec; + + /* number of pages we're dealing with */ + unsigned long nr_pfns; + /* bitmap of pages left to send */ + unsigned long *to_send; if ( mlock(&ctxt, sizeof(ctxt) ) ) { @@ -129,21 +145,24 @@ int xc_linux_save(int xc_handle, goto out; } - sleep(1); + usleep(1000); // 1ms + printf("Sleep for 1ms\n"); } +#if 1 /* A cheesy test to see whether the domain contains valid state. */ if ( ctxt.pt_base == 0 ) { ERROR("Domain is not in a valid Linux guest OS state"); goto out; } +#endif /* Map the suspend-record MFN to pin it. The page must be owned by domid for this to succeed. */ p_srec = mfn_mapper_map_single(xc_handle, domid, - sizeof(srec), PROT_READ, + sizeof(*p_srec), PROT_READ, ctxt.cpu_ctxt.esi ); if (!p_srec) @@ -152,10 +171,10 @@ int xc_linux_save(int xc_handle, goto out; } - memcpy( &srec, p_srec, sizeof(srec) ); + nr_pfns = p_srec->nr_pfns; /* cheesy sanity check */ - if ( srec.nr_pfns > 1024*1024 ) + if ( nr_pfns > 1024*1024 ) { ERROR("Invalid state record -- pfn count out of range"); goto out; @@ -165,55 +184,13 @@ int xc_linux_save(int xc_handle, live_pfn_to_mfn_frame_list = mfn_mapper_map_single(xc_handle, domid, PAGE_SIZE, PROT_READ, - srec.pfn_to_mfn_frame_list ); + p_srec->pfn_to_mfn_frame_list ); if (!live_pfn_to_mfn_frame_list) { ERROR("Couldn't map pfn_to_mfn_frame_list"); goto out; } - - - if ( (mapper_handle1 = mfn_mapper_init(xc_handle, domid, - 1024*1024, PROT_READ )) - == NULL ) - goto out; - - for ( i = 0; i < (srec.nr_pfns+1023)/1024; i++ ) - { - /* Grab a copy of the pfn-to-mfn table frame list. - This has the effect of preventing the page from being freed and - given to another domain. (though the domain is stopped anyway...) */ - mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT, - live_pfn_to_mfn_frame_list[i], - PAGE_SIZE ); - } - - if ( mfn_mapper_flush_queue(mapper_handle1) ) - { - ERROR("Couldn't map pfn_to_mfn table"); - goto out; - } - - live_pfn_to_mfn_table = mfn_mapper_base( mapper_handle1 ); - - - - /* We want zeroed memory so use calloc rather than malloc. */ - pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long)); - - if ( (pfn_type == NULL) ) - { - errno = ENOMEM; - goto out; - } - - if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ) - { - ERROR("Unable to mlock"); - goto out; - } - /* Track the mfn_to_pfn table down from the domains PT */ { @@ -233,49 +210,106 @@ int xc_linux_save(int xc_handle, mfn_to_pfn_table_start_mfn ); } + /* Map all the frames of the pfn->mfn table. For migrate to succeed, + the guest must not change which frames are used for this purpose. + (its not clear why it would want to change them, and we'll be OK + from a safety POV anyhow. */ - /* - * Quick belt and braces sanity check. - */ + live_pfn_to_mfn_table = mfn_mapper_map_batch( xc_handle, domid, + PROT_READ, + live_pfn_to_mfn_frame_list, + (nr_pfns+1023)/1024 ); + if( !live_pfn_to_mfn_table ) + { + PERROR("Couldn't map pfn_to_mfn table"); + goto out; + } + + for(i=0;i<(nr_pfns+1023)/1024 ;i++) + printf("LF: %d %x\n",i,live_pfn_to_mfn_frame_list[i]); - for ( i = 0; i < srec.nr_pfns; i++ ) + + /* At this point, we can start the domain again if we're doign a + live suspend */ + + if( live ) + { +#if 1 + if ( xc_shadow_control( xc_handle, domid, + DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY, + NULL, 0 ) < 0 ) + { + ERROR("Couldn't enable shadow mode"); + goto out; + } +#endif + if ( xc_domain_start( xc_handle, domid ) < 0 ) + { + ERROR("Couldn't restart domain"); + goto out; + } +//exit(-1); + last_iter = 0; + sent_last_iter = 1<<20; // 4GB's worth of pages + max_iters = 8; // limit us to 9 time round loop + } + else + last_iter = 1; + + + /* Setup to_send bitmap */ { - mfn = live_pfn_to_mfn_table[i]; + int sz = (nr_pfns/8) + 8; // includes slop at end of array + + to_send = malloc( sz ); - if( live_mfn_to_pfn_table[mfn] != i ) - printf("i=%d mfn=%d live_mfn_to_pfn_table=%d\n", - i,mfn,live_mfn_to_pfn_table[mfn]); + if (!to_send) + { + ERROR("Couldn't allocate to_send array"); + goto out; + } + memset( to_send, 0xff, sz ); + + if ( mlock( to_send, sz ) ) + { + PERROR("Unable to mlock to_send"); + return 1; + } } - /* Canonicalise the suspend-record frame number. */ - if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ) + /* We want zeroed memory so use calloc rather than malloc. */ + pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long)); + + if ( (pfn_type == NULL) ) { - ERROR("State record is not in range of pseudophys map"); + errno = ENOMEM; goto out; } - /* Canonicalise each GDT frame number. */ - for ( i = 0; i < ctxt.gdt_ents; i += 512 ) + if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ) { - if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) - { - ERROR("GDT frame is not in range of pseudophys map"); - goto out; - } + ERROR("Unable to mlock"); + goto out; } - /* Canonicalise the page table base pointer. */ - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) + + /* + * Quick belt and braces sanity check. + */ + + for ( i = 0; i < nr_pfns; i++ ) { - ERROR("PT base is not in range of pseudophys map"); - goto out; + mfn = live_pfn_to_mfn_table[i]; + + if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0x80000004) ) + printf("i=0x%x mfn=%x live_mfn_to_pfn_table=%x\n", + i,mfn,live_mfn_to_pfn_table[mfn]); } - ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT; /* Canonicalise the pfn-to-mfn table frame-number list. */ memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE ); - for ( i = 0; i < srec.nr_pfns; i += 1024 ) + for ( i = 0; i < nr_pfns; i += 1024 ) { if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ) { @@ -284,7 +318,7 @@ int xc_linux_save(int xc_handle, } } - /* Start writing out the saved-domain record. */ + /* Map the shared info frame */ live_shinfo = mfn_mapper_map_single(xc_handle, domid, PAGE_SIZE, PROT_READ, shared_info_frame); @@ -295,164 +329,290 @@ int xc_linux_save(int xc_handle, goto out; } + /* Start writing out the saved-domain record. */ + if ( (*writerfn)(writerst, "LinuxGuestRecord", 16) || (*writerfn)(writerst, name, sizeof(name)) || - (*writerfn)(writerst, &srec.nr_pfns, sizeof(unsigned long)) || - (*writerfn)(writerst, &ctxt, sizeof(ctxt)) || - (*writerfn)(writerst, live_shinfo, PAGE_SIZE) || + (*writerfn)(writerst, &nr_pfns, sizeof(unsigned long)) || (*writerfn)(writerst, pfn_to_mfn_frame_list, PAGE_SIZE) ) { ERROR("Error when writing to state file (1)"); goto out; } - munmap(live_shinfo, PAGE_SIZE); - - verbose_printf("Saving memory pages: 0%%"); - - if ( (mapper_handle2 = mfn_mapper_init(xc_handle, domid, - BATCH_SIZE*4096, PROT_READ )) - == NULL ) - goto out; - - region_base = mfn_mapper_base( mapper_handle2 ); /* Now write out each data page, canonicalising page tables as we go... */ - prev_pc = 0; - for ( n = 0; n < srec.nr_pfns; ) + + while(1) { - this_pc = (n * 100) / srec.nr_pfns; - if ( (this_pc - prev_pc) >= 5 ) - { - verbose_printf("\b\b\b\b%3d%%", this_pc); - prev_pc = this_pc; - } + unsigned int prev_pc, batch, sent_this_iter; - for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) - { - pfn_type[j] = live_pfn_to_mfn_table[i]; - } + iter++; + sent_this_iter = 0; + prev_pc = 0; + verbose_printf("Saving memory pages: iter %d 0%%", iter); - for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) + n=0; + while( n < nr_pfns ) { - /* queue up mappings for all of the pages in this batch */ + unsigned int this_pc = (n * 100) / nr_pfns; + if ( (this_pc - prev_pc) >= 5 ) + { + verbose_printf("\b\b\b\b%3d%%", this_pc); + prev_pc = this_pc; + } -//printf("region n=%d j=%d i=%d mfn=%d\n",n,j,i,live_pfn_to_mfn_table[i]); - mfn_mapper_queue_entry( mapper_handle2, j<<PAGE_SHIFT, - live_pfn_to_mfn_table[i], - PAGE_SIZE ); - } - if( mfn_mapper_flush_queue(mapper_handle2) ) - { - ERROR("Couldn't map page region"); - goto out; - } + /* load pfn_type[] with the mfn of all the pages we're doing in + this batch. */ - if ( get_pfn_type_batch(xc_handle, domid, j, pfn_type) ) - { - ERROR("get_pfn_type_batch failed"); - goto out; - } - - for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) - { - if((pfn_type[j]>>29) == 7) + for( batch = 0; batch < BATCH_SIZE && n < nr_pfns ; n++ ) { - ERROR("bogus page"); - goto out; - } + if ( !test_bit(n, to_send ) ) continue; - /* canonicalise mfn->pfn */ - pfn_type[j] = (pfn_type[j] & PGT_type_mask) | - live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]; - -/* if(pfn_type[j]>>29) - printf("i=%d type=%d\n",i,pfn_type[i]); */ - } + pfn_type[batch] = live_pfn_to_mfn_table[n]; + if( pfn_type[batch] == 0x80000004 ) + { + //printf("Skip netbuf pfn %lx. mfn %lx\n",n,pfn_type[batch]); + continue; + } - if ( (*writerfn)(writerst, &j, sizeof(int) ) ) - { - ERROR("Error when writing to state file (2)"); - goto out; - } +//if(iter>1) printf("pfn=%x mfn=%x\n",n,pfn_type[batch]); + + batch++; + } - if ( (*writerfn)(writerst, pfn_type, sizeof(unsigned long)*j ) ) - { - ERROR("Error when writing to state file (3)"); - goto out; - } + for( j = 0; j < batch; j++ ) + { + if( (pfn_type[j] &0xfffff) == 0x0000004 ) + { + printf("XXXXXXXXSkip netbuf entry %d mfn %lx\n",j,pfn_type[j]); + } - for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ ) - { - /* write out pages in batch */ + + } - if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) || - ((pfn_type[j] & PGT_type_mask) == L2TAB) ) + + printf("batch %d:%d (n=%d)\n",iter,batch,n); + + if(batch == 0) goto skip; // vanishingly unlikely... + + if ( (region_base = mfn_mapper_map_batch( xc_handle, domid, + PROT_READ, + pfn_type, + batch )) == 0) + { + PERROR("map batch failed"); + goto out; + } + + if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) ) { + ERROR("get_pfn_type_batch failed"); + goto out; + } + + for( j = 0; j < batch; j++ ) + { + if((pfn_type[j]>>29) == 7) + { + //printf("type fail: page %i mfn %08lx\n",j,pfn_type[j]); + continue; + } +//if((pfn_type[j] & PGT_type_mask) == L2TAB) printf("L2 pfn=%08lx mfn=%lx\n",pfn_type[j],live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]); - memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE); + /* canonicalise mfn->pfn */ + pfn_type[j] = (pfn_type[j] & PGT_type_mask) | + live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]; + } + + + if ( (*writerfn)(writerst, &batch, sizeof(int) ) ) + { + ERROR("Error when writing to state file (2)"); + goto out; + } - for ( k = 0; - k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ? - (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); - k++ ) + if ( (*writerfn)(writerst, pfn_type, sizeof(unsigned long)*j ) ) + { + ERROR("Error when writing to state file (3)"); + goto out; + } + + /* entering this loop, pfn_type is now in pfns (Not mfns) */ + for( j = 0; j < batch; j++ ) + { + /* write out pages in batch */ + + if((pfn_type[j]>>29) == 7) { - if ( !(page[k] & _PAGE_PRESENT) ) continue; - mfn = page[k] >> PAGE_SHIFT; - - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) + //printf("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]); + continue; + } + + if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) || + ((pfn_type[j] & PGT_type_mask) == L2TAB) ) + { + + memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE); + + for ( k = 0; + k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ? + (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); + k++ ) { - ERROR("Frame number in pagetable page is invalid"); + unsigned long pfn; + + if ( !(page[k] & _PAGE_PRESENT) ) continue; + mfn = page[k] >> PAGE_SHIFT; + pfn = live_mfn_to_pfn_table[mfn]; + + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) + { + printf("FNI %d : [%08lx,%d] pte=%08lx, mfn=%08lx, pfn=%08lx [mfn]=%08lx\n", + j, pfn_type[j], k, + page[k], mfn, live_mfn_to_pfn_table[mfn], + (live_mfn_to_pfn_table[mfn]<nr_pfns)? + live_pfn_to_mfn_table[live_mfn_to_pfn_table[mfn]]: 0xdeadbeef); + pfn = 0; // be suspicious + +// ERROR("Frame number in pagetable page is invalid"); +// goto out; + + + } + page[k] &= PAGE_SIZE - 1; + page[k] |= pfn << PAGE_SHIFT; + + /* + printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n", + pfn_type[j]>>29, + j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT); + */ + + } /* end of page table rewrite for loop */ + + if ( (*writerfn)(writerst, page, PAGE_SIZE) ) + { + ERROR("Error when writing to state file (4)"); + goto out; + } + + } /* end of it's a PT page */ + else + { /* normal page */ + if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) ) + { + ERROR("Error when writing to state file (5)"); goto out; } - page[k] &= PAGE_SIZE - 1; - page[k] |= live_mfn_to_pfn_table[mfn] << PAGE_SHIFT; - - /* - printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n", - pfn_type[j]>>29, - j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT); - */ - } + } /* end of the write out for this batch */ + + sent_this_iter += batch; - if ( (*writerfn)(writerst, page, PAGE_SIZE) ) - { - ERROR("Error when writing to state file (4)"); - goto out; - } + } /* end of this while loop for this iteration */ + munmap(region_base, batch*PAGE_SIZE); + + skip: + + verbose_printf("\b\b\b\b100%% (%d pages)\n", sent_this_iter ); + + if ( last_iter ) + break; + if ( live ) + { + if ( sent_this_iter < (sent_last_iter * 0.95) && iter < max_iters ) + { + // we seem to be doing OK, keep going } else { - if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) ) - { - ERROR("Error when writing to state file (5)"); - goto out; - } + printf("Start last iteration\n"); + last_iter = 1; + + xc_domain_stop_sync( xc_handle, domid ); + + } + + if ( xc_shadow_control( xc_handle, domid, + DOM0_SHADOW_CONTROL_OP_CLEAN, + to_send, nr_pfns ) != nr_pfns ) + { + ERROR("Error flushing shadow PT"); + goto out; } + +#if 0 + if(last_iter) memset(to_send, 0xff, (nr_pfns+7)/8 ); +#endif + + sent_last_iter = sent_this_iter; } - - n+=j; /* i is the master loop counter */ - } - verbose_printf("\b\b\b\b100%%\nMemory saved.\n"); + + } /* end of while 1 */ + +printf("All memory is saved\n"); /* Success! */ rc = 0; - + /* Zero terminate */ if ( (*writerfn)(writerst, &rc, sizeof(int)) ) { ERROR("Error when writing to state file (6)"); goto out; } - + /* Get the final execution context */ + op.cmd = DOM0_GETDOMAININFO; + op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = &ctxt; + if ( (do_dom0_op(xc_handle, &op) < 0) || + ((u64)op.u.getdomaininfo.domain != domid) ) + { + PERROR("Could not get info on domain"); + goto out; + } +printf("A\n"); + /* Canonicalise the suspend-record frame number. */ + if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ) + { + ERROR("State record is not in range of pseudophys map"); + goto out; + } +printf("B\n"); + /* Canonicalise each GDT frame number. */ + for ( i = 0; i < ctxt.gdt_ents; i += 512 ) + { + if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) + { + ERROR("GDT frame is not in range of pseudophys map"); + goto out; + } + } +printf("C\n"); + /* Canonicalise the page table base pointer. */ + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) + { + ERROR("PT base is not in range of pseudophys map"); + goto out; + } + ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT; +printf("D\n"); + if ( (*writerfn)(writerst, &ctxt, sizeof(ctxt)) || + (*writerfn)(writerst, live_shinfo, PAGE_SIZE) ) + { + ERROR("Error when writing to state file (1)"); + goto out; + } + munmap(live_shinfo, PAGE_SIZE); +printf("E\n"); out: /* Restart the domain if we had to stop it to save its state. */ if ( we_stopped_it ) diff --git a/tools/xc/lib/xc_private.c b/tools/xc/lib/xc_private.c index d137176ca8..47931f28ec 100644 --- a/tools/xc/lib/xc_private.c +++ b/tools/xc/lib/xc_private.c @@ -47,6 +47,31 @@ void unmap_pfn(int pm_handle, void *vaddr) /*******************/ +void * mfn_mapper_map_batch(int xc_handle, domid_t dom, int prot, + unsigned long *arr, int num ) +{ + privcmd_mmapbatch_t ioctlx; + void *addr; + addr = mmap( NULL, num*PAGE_SIZE, prot, MAP_SHARED, xc_handle, 0 ); + if (addr) + { + ioctlx.num=num; + ioctlx.dom=dom; + ioctlx.addr=(unsigned long)addr; + ioctlx.arr=arr; + if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAPBATCH, &ioctlx ) <0 ) + { + perror("XXXXXXXX"); + munmap(addr, num*PAGE_SIZE); + return 0; + } + } + return addr; + +} + +/*******************/ + void * mfn_mapper_map_single(int xc_handle, domid_t dom, int size, int prot, unsigned long mfn ) @@ -64,7 +89,10 @@ void * mfn_mapper_map_single(int xc_handle, domid_t dom, entry.mfn=mfn; entry.npages=(size+PAGE_SIZE-1)>>PAGE_SHIFT; if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx ) <0 ) + { + munmap(addr, size); return 0; + } } return addr; } @@ -295,7 +323,7 @@ static int flush_mmu_updates(int xc_handle, mmu_t *mmu) hypercall.op = __HYPERVISOR_mmu_update; hypercall.arg[0] = (unsigned long)mmu->updates; - hypercall.arg[1] = (unsigned long)mmu->idx; + hypercall.arg[1] = (unsigned long)&(mmu->idx); if ( mlock(mmu->updates, sizeof(mmu->updates)) != 0 ) { @@ -342,3 +370,47 @@ int finish_mmu_updates(int xc_handle, mmu_t *mmu) { return flush_mmu_updates(xc_handle, mmu); } + + +/***********************************************************/ + +/* this function is a hack until we get proper synchronous domain stop */ + +int xc_domain_stop_sync( int xc_handle, domid_t domid ) +{ + dom0_op_t op; + + while (1) + { + op.cmd = DOM0_STOPDOMAIN; + op.u.stopdomain.domain = (domid_t)domid; + if ( do_dom0_op(xc_handle, &op) != 0 ) + { + PERROR("Stopping target domain failed"); + goto out; + } + + usleep(1000); // 1ms + printf("Sleep for 1ms\n"); + + op.cmd = DOM0_GETDOMAININFO; + op.u.getdomaininfo.domain = (domid_t)domid; + op.u.getdomaininfo.ctxt = NULL; + if ( (do_dom0_op(xc_handle, &op) < 0) || + ((u64)op.u.getdomaininfo.domain != domid) ) + { + PERROR("Could not get info on domain"); + goto out; + } + + if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED ) + { + printf("Domain %lld stopped\n",domid); + return 0; + } + + } + +out: + return -1; +} diff --git a/tools/xc/lib/xc_private.h b/tools/xc/lib/xc_private.h index 3a2e3ea9f1..e3eff85e59 100644 --- a/tools/xc/lib/xc_private.h +++ b/tools/xc/lib/xc_private.h @@ -232,6 +232,9 @@ typedef struct mfn_mapper { void * mfn_mapper_map_single(int xc_handle, domid_t dom, int size, int prot, unsigned long mfn ); +void * mfn_mapper_map_batch(int xc_handle, domid_t dom, int prot, + unsigned long *arr, int num ); + mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot); void * mfn_mapper_base(mfn_mapper_t *t); @@ -245,5 +248,6 @@ void * mfn_mapper_queue_entry(mfn_mapper_t *t, int offset, /*********************/ +int xc_domain_stop_sync( int xc_handle, domid_t dom ); #endif /* __XC_PRIVATE_H__ */ diff --git a/tools/xc/py/Xc.c b/tools/xc/py/Xc.c index 929e9f3104..97bff12492 100644 --- a/tools/xc/py/Xc.c +++ b/tools/xc/py/Xc.c @@ -190,16 +190,17 @@ static PyObject *pyxc_linux_save(PyObject *self, u64 dom; char *state_file; - int progress = 1; + int progress = 1, live = 0; unsigned int flags = 0; - static char *kwd_list[] = { "dom", "state_file", "progress", NULL }; + static char *kwd_list[] = { "dom", "state_file", "progress", "live", NULL }; - if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|i", kwd_list, - &dom, &state_file, &progress) ) + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|ii", kwd_list, + &dom, &state_file, &progress, &live) ) return NULL; if (progress) flags |= XCFLAGS_VERBOSE; + if (live) flags |= XCFLAGS_LIVE; if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0) { @@ -1273,7 +1274,7 @@ static PyObject *pyxc_shadow_control(PyObject *self, &dom, &op) ) return NULL; - if ( xc_shadow_control(xc->xc_handle, dom, op) != 0 ) + if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0) < 0 ) return PyErr_SetFromErrno(xc_error); Py_INCREF(zero); diff --git a/tools/xend/lib/utils.c b/tools/xend/lib/utils.c index 297976e9be..441b62f153 100644 --- a/tools/xend/lib/utils.c +++ b/tools/xend/lib/utils.c @@ -723,6 +723,11 @@ static PyObject *xu_port_new(PyObject *self, PyObject *args) goto fail4; } + xup->interface->tx_resp_prod = 0; + xup->interface->rx_req_prod = 0; + xup->interface->tx_req_prod = 0; + xup->interface->rx_resp_prod = 0; + xup->tx_req_cons = 0; xup->tx_resp_prod = 0; xup->rx_req_prod = 0; diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index 9370a61a8d..dee7552bdd 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -525,10 +525,10 @@ long do_dom0_op(dom0_op_t *u_dom0_op) p = find_domain_by_id( op->u.shadow_control.domain ); if ( p ) { - ret = shadow_mode_control(p, op->u.shadow_control.op ); + ret = shadow_mode_control(p, &op->u.shadow_control ); put_task_struct(p); - } - + copy_to_user(u_dom0_op, op, sizeof(*op)); + } } break; diff --git a/xen/common/domain.c b/xen/common/domain.c index a9c40ae98f..b9e8150bfb 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -89,9 +89,15 @@ struct task_struct *do_createdomain(domid_t dom_id, unsigned int cpu) memset(p->shared_info, 0, PAGE_SIZE); SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p); + machine_to_phys_mapping[virt_to_phys(p->shared_info) >> PAGE_SHIFT] = + 0x80000000UL; // set m2p table to magic marker (helps debug) + p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL); memset(p->mm.perdomain_pt, 0, PAGE_SIZE); + machine_to_phys_mapping[virt_to_phys(p->mm.perdomain_pt) >> PAGE_SHIFT] = + 0x0fffdeadUL; // set m2p table to magic marker (helps debug) + init_blkdev_info(p); /* Per-domain PCI-device list. */ @@ -486,6 +492,7 @@ void free_all_dom_mem(struct task_struct *p) unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) { unsigned int alloc_pfns, nr_pages; + struct pfn_info *page; nr_pages = (kbytes + ((PAGE_SIZE-1)>>10)) >> (PAGE_SHIFT - 10); p->max_pages = nr_pages; /* this can now be controlled independently */ @@ -493,13 +500,16 @@ unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes) /* grow the allocation if necessary */ for ( alloc_pfns = p->tot_pages; alloc_pfns < nr_pages; alloc_pfns++ ) { - if ( unlikely(alloc_domain_page(p) == NULL) || + if ( unlikely((page=alloc_domain_page(p)) == NULL) || unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) ) { free_all_dom_mem(p); return -ENOMEM; } + + /* initialise to machine_to_phys_mapping table to likely pfn */ + machine_to_phys_mapping[page-frame_table] = alloc_pfns; } p->tot_pages = nr_pages; diff --git a/xen/common/memory.c b/xen/common/memory.c index ed2e5b6e17..243875f22e 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -213,7 +213,12 @@ void __init init_frametable(unsigned long nr_pages) belonging to the machine_to_phys_mapping to CPU0 idle task */ mfn = virt_to_phys((void *)RDWR_MPT_VIRT_START)>>PAGE_SHIFT; -// for(i=0;i<nr_pages;i+=1024,mfn++) + + /* initialise to a magic of 0x55555555 so easier to spot bugs later */ + memset( machine_to_phys_mapping, 0x55, 4*1024*1024 ); + + /* The array is sized for a 4GB machine regardless of actuall mem size. + This costs 4MB -- may want to fix some day */ for(i=0;i<1024*1024;i+=1024,mfn++) { frame_table[mfn].count_and_flags = 1 | PGC_allocated; @@ -325,7 +330,7 @@ static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p) if ( unlikely(!get_page(page, p)) ) { - MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr); + MEM_LOG("Could not get page ref for pfn %08lx", page_nr); return 0; } @@ -944,8 +949,9 @@ static int do_extended_command(unsigned long ptr, unsigned long val) } -int do_mmu_update(mmu_update_t *ureqs, int count) +int do_mmu_update(mmu_update_t *ureqs, int * p_count) { + int count; mmu_update_t req; unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0; struct pfn_info *page; @@ -954,6 +960,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count) unsigned long prev_spfn = 0; l1_pgentry_t *prev_spl1e = 0; + if ( unlikely( get_user(count, p_count) ) ) + { + return -EFAULT; + } + perfc_incrc(calls_to_mmu_update); perfc_addc(num_page_updates, count); @@ -1110,6 +1121,9 @@ int do_mmu_update(mmu_update_t *ureqs, int count) percpu_info[cpu].gps = percpu_info[cpu].pts = NULL; } + if ( unlikely(rc) ) + put_user( count, p_count ); + return rc; } diff --git a/xen/common/network.c b/xen/common/network.c index 2f9051d9e5..befc929474 100644 --- a/xen/common/network.c +++ b/xen/common/network.c @@ -111,6 +111,9 @@ net_vif_t *create_net_vif(domid_t dom) clear_page(new_ring); SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p); + machine_to_phys_mapping[virt_to_phys(new_ring)>>PAGE_SHIFT] = + 0x80000001; // magic value aids debugging + /* * Fill in the new vif struct. Note that, while the vif's refcnt is * non-zero, we hold a reference to the task structure. diff --git a/xen/common/shadow.c b/xen/common/shadow.c index 1144c0e65e..fe142e3ee9 100644 --- a/xen/common/shadow.c +++ b/xen/common/shadow.c @@ -123,6 +123,7 @@ static inline int shadow_page_op( struct mm_struct *m, unsigned int op, } return work; } + static void __scan_shadow_table( struct mm_struct *m, unsigned int op ) { int j, work=0; @@ -150,7 +151,7 @@ static void __scan_shadow_table( struct mm_struct *m, unsigned int op ) } shadow_audit(m,0); } - SH_LOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages)); + SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages)); } @@ -160,7 +161,6 @@ int shadow_mode_enable( struct task_struct *p, unsigned int mode ) struct shadow_status **fptr; int i; - spin_lock_init(&m->shadow_lock); spin_lock(&m->shadow_lock); @@ -217,7 +217,6 @@ int shadow_mode_enable( struct task_struct *p, unsigned int mode ) // call shadow_mk_pagetable shadow_mk_pagetable( m ); - return 0; nomem: @@ -260,9 +259,12 @@ void shadow_mode_disable( struct task_struct *p ) kfree( &m->shadow_ht[0] ); } -static void shadow_mode_table_op( struct task_struct *p, unsigned int op ) +static int shadow_mode_table_op( struct task_struct *p, + dom0_shadow_control_t *sc ) { + unsigned int op = sc->op; struct mm_struct *m = &p->mm; + int rc = 0; // since Dom0 did the hypercall, we should be running with it's page // tables right now. Calling flush on yourself would be really @@ -271,13 +273,13 @@ static void shadow_mode_table_op( struct task_struct *p, unsigned int op ) if ( m == ¤t->mm ) { printk("Don't try and flush your own page tables!\n"); - return; + return -EINVAL; } spin_lock(&m->shadow_lock); - SH_LOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count); + SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count); shadow_audit(m,1); @@ -288,27 +290,60 @@ static void shadow_mode_table_op( struct task_struct *p, unsigned int op ) break; case DOM0_SHADOW_CONTROL_OP_CLEAN: - __scan_shadow_table( m, op ); - // we used to bzero dirty bitmap here, but now leave this to user space - // if we were double buffering we'd do the flip here + { + int i; + + __scan_shadow_table( m, op ); + + if( p->tot_pages > sc->pages || + !sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap ) + { + rc = -EINVAL; + goto out; + } + + sc->pages = p->tot_pages; + +#define chunk (8*1024) // do this in 1KB chunks for L1 cache + + for(i=0;i<p->tot_pages;i+=chunk) + { + int bytes = (( ((p->tot_pages-i) > (chunk))? + (chunk):(p->tot_pages-i) ) + 7) / 8; + + copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))), + p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), + bytes ); + + memset( p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), + 0, bytes); + } + break; + } } + +out: + spin_unlock(&m->shadow_lock); - SH_LOG("shadow mode table op : page count %d", m->shadow_page_count); + SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count); shadow_audit(m,1); // call shadow_mk_pagetable shadow_mk_pagetable( m ); + return rc; } -int shadow_mode_control( struct task_struct *p, unsigned int op ) +int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc ) { int we_paused = 0; + unsigned int cmd = sc->op; + int rc = 0; // don't call if already shadowed... @@ -321,18 +356,23 @@ int shadow_mode_control( struct task_struct *p, unsigned int op ) we_paused = 1; } - if ( p->mm.shadow_mode && op == DOM0_SHADOW_CONTROL_OP_OFF ) + if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF ) { shadow_mode_disable(p); } - else if ( op == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST ) + else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST ) { if(p->mm.shadow_mode) shadow_mode_disable(p); shadow_mode_enable(p, SHM_test); } - else if ( p->mm.shadow_mode && op >= DOM0_SHADOW_CONTROL_OP_FLUSH && op<=DOM0_SHADOW_CONTROL_OP_CLEAN ) + else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY ) { - shadow_mode_table_op(p, op); + if(p->mm.shadow_mode) shadow_mode_disable(p); + shadow_mode_enable(p, SHM_logdirty); + } + else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN ) + { + rc = shadow_mode_table_op(p, sc); } else { @@ -341,7 +381,7 @@ int shadow_mode_control( struct task_struct *p, unsigned int op ) } if ( we_paused ) wake_up(p); - return 0; + return rc; } diff --git a/xen/drivers/block/xen_block.c b/xen/drivers/block/xen_block.c index f44902b1c9..6901262cb8 100644 --- a/xen/drivers/block/xen_block.c +++ b/xen/drivers/block/xen_block.c @@ -19,6 +19,7 @@ #include <xen/interrupt.h> #include <xen/vbd.h> #include <xen/slab.h> +#include <xen/shadow.h> /* * These are rather arbitrary. They are fairly large because adjacent requests @@ -358,9 +359,18 @@ static void unlock_buffer(unsigned long buffer, pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT); pfn++ ) { + + /* Find the domain from the frame_table. Yuk... */ + struct task_struct *p = frame_table[pfn].u.domain; + + if( p->mm.shadow_mode == SHM_logdirty ) + mark_dirty( &p->mm, pfn ); + + if ( writeable_buffer ) put_page_type(&frame_table[pfn]); put_page(&frame_table[pfn]); + } } @@ -597,6 +607,10 @@ void init_blkdev_info(struct task_struct *p) p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL); clear_page(p->blk_ring_base); SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p); + + machine_to_phys_mapping[virt_to_phys(p->blk_ring_base)>>PAGE_SHIFT] = + 0x80000002; // magic value aids debugging + p->blkdev_list.next = NULL; spin_lock_init(&p->vbd_lock); } diff --git a/xen/include/asm-i386/processor.h b/xen/include/asm-i386/processor.h index 2968e2e4e9..c16d476016 100644 --- a/xen/include/asm-i386/processor.h +++ b/xen/include/asm-i386/processor.h @@ -449,7 +449,7 @@ struct mm_struct { struct shadow_status *shadow_ht; struct shadow_status *shadow_ht_free; struct shadow_status *shadow_ht_extras; /* extra allocation units */ - unsigned int *shadow_dirty_bitmap; + unsigned long *shadow_dirty_bitmap; unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */ unsigned int shadow_page_count; unsigned int shadow_max_page_count; diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h index 024e75ad83..0027e9df29 100644 --- a/xen/include/hypervisor-ifs/dom0_ops.h +++ b/xen/include/hypervisor-ifs/dom0_ops.h @@ -243,6 +243,9 @@ typedef struct dom0_shadow_control_st /* IN variables. */ domid_t domain; int op; + unsigned long *dirty_bitmap; // pointe to mlocked buffer + /* IN/OUT variables */ + unsigned long pages; // size of buffer, updated with actual size } dom0_shadow_control_t; #define DOM0_SETDOMAINNAME 26 diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h index 628d20c17e..c132ad9662 100644 --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -164,8 +164,8 @@ static inline int get_page(struct pfn_info *page, unlikely(x & PGC_zombie) || /* Zombie? */ unlikely(p != domain) ) /* Wrong owner? */ { - DPRINTK("Error pfn %08lx: ed=%p(%lld), sd=%p(%lld), caf=%08x\n", - page_to_pfn(page), domain, (domain)?domain->domain:1234, p, (p)?p->domain:1234, x); + DPRINTK("Error pfn %08lx: ed=%p(%lld), sd=%p(%lld), caf=%08x, taf=%08x\n", + page_to_pfn(page), domain, (domain)?domain->domain:999, p, (p && !((x & PGC_count_mask) == 0))?p->domain:999, x, page->type_and_flags); return 0; } __asm__ __volatile__( @@ -314,7 +314,7 @@ int check_descriptor(unsigned long a, unsigned long b); #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START) /* Part of the domain API. */ -int do_mmu_update(mmu_update_t *updates, int count); +int do_mmu_update(mmu_update_t *updates, int *count); #define DEFAULT_GDT_ENTRIES ((LAST_RESERVED_GDT_ENTRY*8)+7) #define DEFAULT_GDT_ADDRESS ((unsigned long)gdt_table) diff --git a/xen/include/xen/shadow.h b/xen/include/xen/shadow.h index fba6fe3dfd..01b46301aa 100644 --- a/xen/include/xen/shadow.h +++ b/xen/include/xen/shadow.h @@ -23,7 +23,7 @@ #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) #define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) -extern int shadow_mode_control( struct task_struct *p, unsigned int op ); +extern int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc ); extern int shadow_fault( unsigned long va, long error_code ); extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, unsigned long *prev_spfn_ptr, @@ -50,7 +50,7 @@ struct shadow_status { #ifndef NDEBUG #define SH_LOG(_f, _a...) \ - printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \ current->domain , __LINE__ , ## _a ) #else #define SH_LOG(_f, _a...) @@ -58,7 +58,7 @@ struct shadow_status { #if SHADOW_DEBUG #define SH_VLOG(_f, _a...) \ - printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \ current->domain , __LINE__ , ## _a ) #else #define SH_VLOG(_f, _a...) @@ -66,19 +66,27 @@ struct shadow_status { #if 0 #define SH_VVLOG(_f, _a...) \ - printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \ current->domain , __LINE__ , ## _a ) #else #define SH_VVLOG(_f, _a...) #endif - /************************************************************************/ static inline void mark_dirty( struct mm_struct *m, unsigned int mfn ) { - unsigned int pfn = machine_to_phys_mapping[mfn]; + unsigned int pfn; + + pfn = machine_to_phys_mapping[mfn]; + + /* We use values with the top bit set to mark MFNs that aren't + really part of the domain's psuedo-physical memory map e.g. + the shared info frame. Nothing to do here... + */ + if ( unlikely(pfn & 0x80000000U) ) return; + ASSERT(m->shadow_dirty_bitmap); if( likely(pfn<m->shadow_dirty_bitmap_size) ) { @@ -91,7 +99,14 @@ static inline void mark_dirty( struct mm_struct *m, unsigned int mfn ) } else { - SH_LOG("mark_dirty pfn out of range attempt!"); + extern void show_traceX(void); + SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)", + mfn, pfn, m->shadow_dirty_bitmap_size, m ); + SH_LOG("dom=%lld caf=%08x taf=%08x\n", + frame_table[mfn].u.domain->domain, + frame_table[mfn].count_and_flags, + frame_table[mfn].type_and_flags ); + //show_traceX(); } } @@ -116,7 +131,7 @@ static inline void l1pte_write_fault( struct mm_struct *m, spte = gpte; gpte |= _PAGE_DIRTY | _PAGE_ACCESSED; spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED; - mark_dirty( m, gpte >> PAGE_SHIFT ); + mark_dirty( m, (gpte >> PAGE_SHIFT) ); break; } @@ -343,7 +358,7 @@ static inline unsigned long get_shadow_status( struct mm_struct *m, if( m->shadow_mode == SHM_logdirty ) mark_dirty( m, gpfn ); - + spin_lock(&m->shadow_lock); res = __shadow_status( m, gpfn ); if (!res) spin_unlock(&m->shadow_lock); diff --git a/xen/net/dev.c b/xen/net/dev.c index 5ab01092f0..0252568131 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -547,6 +547,9 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) goto out; } + machine_to_phys_mapping[new_page - frame_table] = + machine_to_phys_mapping[old_page - frame_table]; + if ( p->mm.shadow_mode && (spte_pfn=get_shadow_status(&p->mm, pte_page-frame_table)) ) { @@ -557,17 +560,15 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) *sptr = new_pte; unmap_domain_mem(sptr); - if( p->mm.shadow_mode == SHM_logdirty ) - mark_dirty( &p->mm, new_page-frame_table ); - put_shadow_status(&p->mm); } - - machine_to_phys_mapping[new_page - frame_table] - = machine_to_phys_mapping[old_page - frame_table]; unmap_domain_mem(ptep); + /* if in shadow mode, mark the buffer as dirty */ + if( p->mm.shadow_mode == SHM_logdirty ) + mark_dirty( &p->mm, (new_page-frame_table) ); + /* Updates must happen before releasing the descriptor. */ smp_wmb(); @@ -2143,8 +2144,6 @@ static void get_rx_bufs(net_vif_t *vif) put_page_and_type(pte_page); make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0); goto rx_unmap_and_continue; - - /* XXX IAP should SHADOW_CONFIG do something here? */ } /* @@ -2156,9 +2155,11 @@ static void get_rx_bufs(net_vif_t *vif) 0) != (PGC_allocated | PGC_tlb_flush_on_type_change | 2)) ) { - DPRINTK("Page held more than once %08x %s\n", + DPRINTK("Page held more than once mfn=%x %08x %s\n", + buf_page-frame_table, buf_page->count_and_flags, (buf_page->u.domain)?buf_page->u.domain->name:"None"); + if ( !get_page_type(buf_page, PGT_writeable_page) ) put_page(buf_page); else if ( cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) != @@ -2264,6 +2265,13 @@ long flush_bufs_for_vif(net_vif_t *vif) put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]); + /* if in shadow mode, mark the PTE as dirty */ + if( p->mm.shadow_mode == SHM_logdirty ) + mark_dirty( &p->mm, rx->pte_ptr>>PAGE_SHIFT ); + /* assume the shadow page table is about to be blown away, + and that its not worth marking the buffer as dirty */ + + make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0); } vif->rx_cons = i; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c index d00dd98f7b..43a6a23479 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c @@ -527,8 +527,6 @@ static void reset_xlblk_interface(void) { block_io_op_t op; - nr_pending = 0; - op.cmd = BLOCK_IO_OP_RESET; if ( HYPERVISOR_block_io_op(&op) != 0 ) printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n"); @@ -549,6 +547,8 @@ int __init xlblk_init(void) { int error; + nr_pending = 0; + reset_xlblk_interface(); xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV); diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c index ccda2c2022..2fc577061e 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c @@ -103,12 +103,12 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, if (msg[j].va + (msg[j].npages<<PAGE_SHIFT) > vma->vm_end) return -EINVAL; - if (rc = direct_remap_area_pages(vma->vm_mm, + if ( (rc = direct_remap_area_pages(vma->vm_mm, msg[j].va&PAGE_MASK, msg[j].mfn<<PAGE_SHIFT, msg[j].npages<<PAGE_SHIFT, vma->vm_page_prot, - mmapcmd.dom)) + mmapcmd.dom)) <0) return rc; } } @@ -116,6 +116,91 @@ static int privcmd_ioctl(struct inode *inode, struct file *file, } break; + case IOCTL_PRIVCMD_MMAPBATCH: + { +#define MAX_DIRECTMAP_MMU_QUEUE 130 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v; + privcmd_mmapbatch_t m; + struct vm_area_struct *vma = NULL; + unsigned long *p, addr; + unsigned long mfn; + int i; + + if ( copy_from_user(&m, (void *)data, sizeof(m)) ) + { ret = -EFAULT; goto batch_err; } + + vma = find_vma( current->mm, m.addr ); + + if (!vma) + { ret = -EINVAL; goto batch_err; } + + if (m.addr > PAGE_OFFSET) + { ret = -EFAULT; goto batch_err; } + + if (m.addr + (m.num<<PAGE_SHIFT) > vma->vm_end) + { ret = -EFAULT; goto batch_err; } + + // everything fits inside the vma + +//printk("direct_r_a_p sx=%ld address=%lx macaddr=%lx dom=%lld\n",size,address,machine_addr,domid); +// memset( u, 0, sizeof(mmu_update_t)*MAX_DIRECTMAP_MMU_QUEUE );// XXX + + + if ( m.dom != 0 ) + { + u[0].val = (unsigned long)(m.dom<<16) & ~0xFFFFUL; + u[0].ptr = (unsigned long)(m.dom<< 0) & ~0xFFFFUL; + u[1].val = (unsigned long)(m.dom>>16) & ~0xFFFFUL; + u[1].ptr = (unsigned long)(m.dom>>32) & ~0xFFFFUL; + u[0].ptr |= MMU_EXTENDED_COMMAND; + u[0].val |= MMUEXT_SET_SUBJECTDOM_L; + u[1].ptr |= MMU_EXTENDED_COMMAND; + u[1].val |= MMUEXT_SET_SUBJECTDOM_H; + v = w = &u[2]; + } + else + { + v = w = &u[0]; + } + + p = m.arr; + addr = m.addr; +//printk("BATCH: arr=%p addr=%lx num=%d u=%p,w=%p\n",p,addr,m.num,u,w); + for (i=0; i<m.num; i++, addr+=PAGE_SIZE, p++) + { + unsigned int count; + if ( get_user(mfn, p) ) return -EFAULT; + + v->val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot) | + _PAGE_IO; + + __direct_remap_area_pages( vma->vm_mm, + addr, + PAGE_SIZE, + v); + v++; + count = v-u; +//printk("Q i=%d mfn=%x co=%d v=%p : %lx %lx\n",i,mfn,count,v, w->val,w->ptr); + + if ( HYPERVISOR_mmu_update(u, &count) < 0 ) + { + //printk("Fail %d->%d mfn=%lx\n",v-u,count, w->val); + put_user( 0xe0000000 | mfn, p ); + } + v=w; + } + ret = 0; + break; + + batch_err: + printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%lx %lx-%lx\n", + ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end); + break; + } + break; + + + default: ret = -EINVAL; break; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c index ef54ff7fe9..daa8441d7b 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c @@ -248,6 +248,8 @@ static void network_alloc_rx_buffers(struct net_device *dev) np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = virt_to_machine(get_ppte(skb->head)); + /* Shadow optimisation: disown this page from p->m map */ + phys_to_machine_mapping[virt_to_phys(skb->head)>>PAGE_SHIFT] = 0x80000004; np->rx_bufs_to_notify++; } while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE ); @@ -364,6 +366,9 @@ static inline void _network_interrupt(struct net_device *dev) skb = np->rx_skbs[rx->id]; ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); + phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = + (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; + if ( unlikely(rx->status != RING_STATUS_OK) ) { /* Gate this error. We get a (valid) slew of them on suspend. */ @@ -382,9 +387,6 @@ static inline void _network_interrupt(struct net_device *dev) skb_shinfo(skb)->nr_frags = 0; skb_shinfo(skb)->frag_list = NULL; - phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = - (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; - skb->data = skb->tail = skb->head + rx->offset; skb_put(skb, rx->size); skb->protocol = eth_type_trans(skb, dev); diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c index 6be85db7f1..b06c6c26b0 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c @@ -1161,11 +1161,11 @@ static void stop_task(void *unused) virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; suspend_record->nr_pfns = max_pfn; - j = 0; - for ( i = 0; i < max_pfn; i += (PAGE_SIZE / sizeof(unsigned long)) ) - pfn_to_mfn_frame_list[j++] = + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + { + pfn_to_mfn_frame_list[j] = virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; - + } /* * NB. This is /not/ a full dev_close() as that loses route information! * Instead we do essentialy the same as dev_close() but without notifying @@ -1207,7 +1207,9 @@ static void stop_task(void *unused) memcpy(&start_info, &suspend_record->resume_info, sizeof(start_info)); set_fixmap(FIX_SHARED_INFO, start_info.shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + memset(empty_zero_page, 0, PAGE_SIZE); irq_resume(); diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/time.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/time.c index 52920cd0fc..3291a0338d 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/time.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/time.c @@ -62,6 +62,7 @@ #include <linux/smp.h> #include <linux/irq.h> #include <linux/sysctl.h> +#include <linux/sysrq.h> spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; extern rwlock_t xtime_lock; @@ -581,6 +582,10 @@ static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs) timer->expires,(u32)(t_st>>32), (u32)t_st); printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n", (u32)(processed_system_time>>32), (u32)processed_system_time); + + + handle_sysrq('t',NULL,NULL,NULL); + } static struct irqaction dbg_time = { diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c index 0337cae1ca..78dbb9ef23 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c @@ -317,16 +317,17 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) __asm__ __volatile__ ( "sldt %0" : "=r" (ldt) ); if ( ldt == 0 ) { - mmu_update_t u; - u.ptr = MMU_EXTENDED_COMMAND; - u.ptr |= (unsigned long)&default_ldt[0]; - u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT); - if ( unlikely(HYPERVISOR_mmu_update(&u, 1) < 0) ) - { - show_trace(NULL); - panic("Failed to install default LDT"); - } - return; + int count = 1; + mmu_update_t u; + u.ptr = MMU_EXTENDED_COMMAND; + u.ptr |= (unsigned long)&default_ldt[0]; + u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT); + if ( unlikely(HYPERVISOR_mmu_update(&u, &count) < 0) ) + { + show_trace(NULL); + panic("Failed to install default LDT"); + } + return; } } diff --git a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c index c6dc710576..daa5ee1d73 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c @@ -37,12 +37,13 @@ static void DEBUG_allow_pt_reads(void) int i; for ( i = idx-1; i >= 0; i-- ) { + int count = 1; pte = update_debug_queue[i].ptep; if ( pte == NULL ) continue; update_debug_queue[i].ptep = NULL; update.ptr = virt_to_machine(pte); update.val = update_debug_queue[i].pteval; - HYPERVISOR_mmu_update(&update, 1); + HYPERVISOR_mmu_update(&update, &count); } } static void DEBUG_disallow_pt_read(unsigned long va) @@ -51,6 +52,7 @@ static void DEBUG_disallow_pt_read(unsigned long va) pmd_t *pmd; pgd_t *pgd; unsigned long pteval; + int count = 1; /* * We may fault because of an already outstanding update. * That's okay -- it'll get fixed up in the fault handler. @@ -62,7 +64,7 @@ static void DEBUG_disallow_pt_read(unsigned long va) update.ptr = virt_to_machine(pte); pteval = *(unsigned long *)pte; update.val = pteval & ~_PAGE_PRESENT; - HYPERVISOR_mmu_update(&update, 1); + HYPERVISOR_mmu_update(&update, &count); update_debug_queue[idx].ptep = pte; update_debug_queue[idx].pteval = pteval; } @@ -100,7 +102,7 @@ void MULTICALL_flush_page_update_queue(void) wmb(); /* Make sure index is cleared first to avoid double updates. */ queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, - _idx); + &_idx); } spin_unlock_irqrestore(&update_lock, flags); } @@ -116,7 +118,7 @@ static inline void __flush_page_update_queue(void) #endif idx = 0; wmb(); /* Make sure index is cleared first to avoid double updates. */ - if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx) < 0) ) + if ( unlikely(HYPERVISOR_mmu_update(update_queue, &_idx) < 0) ) panic("Failed to execute MMU updates"); } diff --git a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c index 28a0a4071a..dbe706bb95 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c +++ b/xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c @@ -27,16 +27,12 @@ #define direct_mk_pte_phys(physpage, pgprot) \ __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) -static inline int direct_remap_area_pte(pte_t *pte, +static inline void direct_remap_area_pte(pte_t *pte, unsigned long address, unsigned long size, - unsigned long machine_addr, - pgprot_t prot, - domid_t domid) + mmu_update_t **v) { unsigned long end; -#define MAX_DIRECTMAP_MMU_QUEUE 130 - mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v, *w; address &= ~PMD_MASK; end = address + size; @@ -45,95 +41,55 @@ static inline int direct_remap_area_pte(pte_t *pte, if (address >= end) BUG(); - /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */ - if ( domid != 0 ) - { - u[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL; - u[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL; - u[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL; - u[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL; - u[0].ptr |= MMU_EXTENDED_COMMAND; - u[0].val |= MMUEXT_SET_SUBJECTDOM_L; - u[1].ptr |= MMU_EXTENDED_COMMAND; - u[1].val |= MMUEXT_SET_SUBJECTDOM_H; - v = w = &u[2]; - } - else - { - v = w = &u[0]; - } - do { - if ( (v-u) == MAX_DIRECTMAP_MMU_QUEUE ) - { - if ( HYPERVISOR_mmu_update(u, MAX_DIRECTMAP_MMU_QUEUE) < 0 ) - return -EFAULT; - v = w; - } -#if 0 /* thanks to new ioctl mmaping interface this is no longer a bug */ +#if 0 // XXX if (!pte_none(*pte)) { printk("direct_remap_area_pte: page already exists\n"); BUG(); } #endif - v->ptr = virt_to_machine(pte); - v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO; - v++; + (*v)->ptr = virt_to_machine(pte); + (*v)++; address += PAGE_SIZE; - machine_addr += PAGE_SIZE; pte++; } while (address && (address < end)); - - if ( ((v-w) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) ) - return -EFAULT; - - return 0; + return ; } static inline int direct_remap_area_pmd(struct mm_struct *mm, pmd_t *pmd, unsigned long address, unsigned long size, - unsigned long machine_addr, - pgprot_t prot, - domid_t domid) + mmu_update_t **v) { - int error = 0; unsigned long end; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; - machine_addr -= address; if (address >= end) BUG(); do { pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - error = direct_remap_area_pte(pte, address, end - address, - address + machine_addr, prot, domid); - if ( error ) - break; + direct_remap_area_pte(pte, address, end - address, v); + address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); - return error; + return 0; } -int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid) +int __direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long size, + mmu_update_t *v) { - int error = 0; pgd_t * dir; unsigned long end = address + size; - machine_addr -= address; dir = pgd_offset(mm, address); flush_cache_all(); if (address >= end) @@ -141,21 +97,89 @@ int direct_remap_area_pages(struct mm_struct *mm, spin_lock(&mm->page_table_lock); do { pmd_t *pmd = pmd_alloc(mm, dir, address); - error = -ENOMEM; if (!pmd) - break; - error = direct_remap_area_pmd(mm, pmd, address, end - address, - machine_addr + address, prot, domid); - if (error) - break; + return -ENOMEM; + direct_remap_area_pmd(mm, pmd, address, end - address, &v); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; + } while (address && (address < end)); spin_unlock(&mm->page_table_lock); flush_tlb_all(); - return error; + return 0; } + +int direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long machine_addr, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + int i, count; + unsigned long start_address; +#define MAX_DIRECTMAP_MMU_QUEUE 130 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v; + + if ( domid != 0 ) + { + u[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL; + u[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL; + u[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL; + u[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL; + u[0].ptr |= MMU_EXTENDED_COMMAND; + u[0].val |= MMUEXT_SET_SUBJECTDOM_L; + u[1].ptr |= MMU_EXTENDED_COMMAND; + u[1].val |= MMUEXT_SET_SUBJECTDOM_H; + v = w = &u[2]; + } + else + { + v = w = &u[0]; + } + + start_address = address; + + for(i=0; i<size; + i+=PAGE_SIZE, machine_addr+=PAGE_SIZE, address+=PAGE_SIZE, v++) + { + if( (v-u) == MAX_DIRECTMAP_MMU_QUEUE ) + { + /* get the ptep's filled in */ + __direct_remap_area_pages( mm, + start_address, + address-start_address, + w); + + count = v-u; + if ( HYPERVISOR_mmu_update(u, &count) < 0 ) + return -EFAULT; + v=w; + start_address = address; + } + + /* fill in the machine addresses */ + v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO; + } + + if(v!=w) + { + /* get the ptep's filled in */ + __direct_remap_area_pages( mm, + start_address, + address-start_address, + w); + count = v-u; + if ( HYPERVISOR_mmu_update(u, &count) < 0 ) + return -EFAULT; + + } + + return 0; +} + + #endif /* CONFIG_XEN_PRIVILEGED_GUEST */ diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h index c454728c0e..e8b2bc40b0 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h @@ -153,7 +153,7 @@ static inline int HYPERVISOR_set_trap_table(trap_info_t *table) return ret; } -static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count) +static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int *count) { int ret; __asm__ __volatile__ ( diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h b/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h index d853a3f2af..143beeeef5 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h @@ -276,4 +276,11 @@ extern int direct_remap_area_pages(struct mm_struct *mm, pgprot_t prot, domid_t domid); +extern int __direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long size, + mmu_update_t *v); + + + #endif /* _I386_PGALLOC_H */ diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h b/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h index 3bf03c6064..08e452de15 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h @@ -25,6 +25,13 @@ typedef struct privcmd_mmap { privcmd_mmap_entry_t *entry; } privcmd_mmap_t; +typedef struct privcmd_mmapbatch { + int num; // number of pages to populate + domid_t dom; // target domain + unsigned long addr; // virtual address + unsigned long *arr; // array of mfns - top nibble set on err +} privcmd_mmapbatch_t; + typedef struct privcmd_blkmsg { unsigned long op; @@ -50,5 +57,7 @@ typedef struct privcmd_blkmsg _IOC(_IOC_NONE, 'P', 1, 0) #define IOCTL_PRIVCMD_MMAP \ _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmapbatch_t)) #endif /* __PROC_CMD_H__ */ |