#include <xeno/config.h>
#include <xeno/init.h>
#include <xeno/lib.h>
#include <xeno/errno.h>
#include <xeno/sched.h>
#include <xeno/mm.h>
#include <xeno/skbuff.h>
#include <xeno/interrupt.h>
#include <xeno/delay.h>
#include <xeno/event.h>
#include <xeno/time.h>
#include <hypervisor-ifs/dom0_ops.h>
#include <asm/io.h>
#include <asm/domain_page.h>
#include <asm/flushtlb.h>
#include <asm/msr.h>
#include <xeno/blkdev.h>
#include <xeno/console.h>
#include <xeno/vbd.h>
#include <asm/i387.h>

/*
 * NB. No ring-3 access in initial guestOS pagetables. Note that we allow
 * ring-3 privileges in the page directories, so that the guestOS may later
 * decide to share a 4MB region with applications.
 */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)

/* The task list and the task hash are both protected by the tasklist_lock. */
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;
struct task_struct *task_hash[TASK_HASH_SIZE];

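/*
 * Allocate and initialise a new domain structure: name, locks, shared-info
 * page, per-domain pagetable, block-device state and scheduler state. The
 * new domain is linked into the task list and the task hash before return.
 */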
struct task_struct *do_createdomain(domid_t dom_id, unsigned int cpu)
{
    int retval;
    char buf[100];
    struct task_struct *p = NULL;
    unsigned long flags;

    retval = -ENOMEM;
    p = alloc_task_struct();
    if ( p == NULL ) return NULL;
    memset(p, 0, sizeof(*p));

    atomic_set(&p->refcnt, 1);

    p->domain    = dom_id;
    p->processor = cpu;

    /* We use a large intermediate to avoid overflow in sprintf. */
    sprintf(buf, "Domain-%llu", dom_id);
    strncpy(p->name, buf, MAX_DOMAIN_NAME);
    p->name[MAX_DOMAIN_NAME-1] = '\0';

    spin_lock_init(&p->blk_ring_lock);
    spin_lock_init(&p->event_channel_lock);

    p->shared_info = (void *)get_free_page(GFP_KERNEL);
    memset(p->shared_info, 0, PAGE_SIZE);
    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);

    p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
    memset(p->mm.perdomain_pt, 0, PAGE_SIZE);

    init_blkdev_info(p);

    p->addr_limit = USER_DS;

    sched_add_domain(p);

    spin_lock_init(&p->page_list_lock);
    INIT_LIST_HEAD(&p->page_list);
    p->max_pages = p->tot_pages = 0;

    write_lock_irqsave(&tasklist_lock, flags);
    SET_LINKS(p);
    p->next_hash = task_hash[TASK_HASH(dom_id)];
    task_hash[TASK_HASH(dom_id)] = p;
    write_unlock_irqrestore(&tasklist_lock, flags);

    return p;
}


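/*
 * Look up a domain by ID in the task hash. On success a reference is taken
 * with get_task_struct(); the caller must drop it with put_task_struct().
 */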
struct task_struct *find_domain_by_id(domid_t dom)
{
    struct task_struct *p;
    unsigned long flags;

    read_lock_irqsave(&tasklist_lock, flags);
    p = task_hash[TASK_HASH(dom)];
    while ( p != NULL )
    {
        if ( p->domain == dom )
        {
            get_task_struct(p);
            break;
        }
        p = p->next_hash;
    }
    read_unlock_irqrestore(&tasklist_lock, flags);

    return p;
}


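/* Report a fatal error on behalf of the current domain, then kill it. */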
void kill_domain_with_errmsg(const char *err)
{
    printk("DOM%llu FATAL ERROR: %s\n", current->domain, err);
    kill_domain();
}


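/*
 * Destroy domain @p. Domain 0 is never destroyed: the machine is rebooted
 * instead. The domain is removed from the scheduler, the task list and the
 * task hash; if @p is the currently executing domain this does not return.
 */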
void __kill_domain(struct task_struct *p)
{
    int i;
    struct task_struct **pp;
    unsigned long flags;

    if ( p->domain == 0 )
    {
        extern void machine_restart(char *);
        printk("Domain 0 killed: rebooting machine!\n");
        machine_restart(0);
    }

    /* Only allow the domain to be destroyed once. */
    if ( !sched_rem_domain(p) )
        return;

    printk("Killing domain %llu\n", p->domain);

    unlink_blkdev_info(p);

    for ( i = 0; i < MAX_DOMAIN_VIFS; i++ )
        unlink_net_vif(p->net_vif_list[i]);

    /*
     * Note this means that find_domain_by_id may fail, even when the caller
     * holds a reference to the domain being queried. Take care!
     */
    write_lock_irqsave(&tasklist_lock, flags);
    REMOVE_LINKS(p);
    pp = &task_hash[TASK_HASH(p->domain)];
    while ( *pp != p ) pp = &(*pp)->next_hash;
    *pp = p->next_hash;
    write_unlock_irqrestore(&tasklist_lock, flags);

    if ( p == current )
    {
        __enter_scheduler();
        BUG(); /* never get here */
    }
    else
    {
        put_task_struct(p);
    }
}


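/* Kill the currently executing domain. */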
void kill_domain(void)
{
    __kill_domain(current);
}


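/*
 * Kill the domain identified by @dom. A stopped domain is destroyed
 * immediately. Otherwise it is sent a DIE notification: a hypervisor event
 * if @force is set, or a guest event that the domain handles itself.
 */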
long kill_other_domain(domid_t dom, int force)
{
    struct task_struct *p;
    unsigned long cpu_mask = 0;

    p = find_domain_by_id(dom);
    if ( p == NULL ) return -ESRCH;

    if ( p->state == TASK_STOPPED )
    {
        __kill_domain(p);
    }
    else if ( force )
    {
        cpu_mask = mark_hyp_event(p, _HYP_EVENT_DIE);
        hyp_event_notify(cpu_mask);
    }
    else
    {
        cpu_mask = mark_guest_event(p, _EVENT_DIE);
        guest_event_notify(cpu_mask);
    }

    put_task_struct(p);
    return 0;
}

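/*
 * Stop the currently executing domain: save its execution context into the
 * shared-info area, flush any lazy FPU state, then mark it TASK_STOPPED and
 * enter the scheduler.
 */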
void stop_domain(void)
{
    memcpy(&current->shared_info->execution_context, 
           get_execution_context(), 
           sizeof(execution_context_t));
    unlazy_fpu(current);
    wmb(); /* All CPUs must see saved info in state TASK_STOPPED. */
    set_current_state(TASK_STOPPED);
    __enter_scheduler();
}

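/*
 * Ask the domain identified by @dom to stop by sending it a STOP guest
 * event. Domain 0 cannot be stopped.
 */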
long stop_other_domain(domid_t dom)
{
    unsigned long cpu_mask;
    struct task_struct *p;
    
    if ( dom == 0 )
        return -EINVAL;

    p = find_domain_by_id(dom);
    if ( p == NULL ) return -ESRCH;
    
    if ( p->state != TASK_STOPPED )
    {
        cpu_mask = mark_guest_event(p, _EVENT_STOP);
        guest_event_notify(cpu_mask);
    }
    
    put_task_struct(p);
    return 0;
}

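/*
 * Allocate one page frame from the free list, flushing the TLBs of any CPUs
 * that may still hold stale mappings of it. If @p is non-NULL the page is
 * added to that domain's allocation list, subject to its max_pages limit.
 */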
struct pfn_info *alloc_domain_page(struct task_struct *p)
{
    struct pfn_info *page = NULL;
    unsigned long flags, mask, pfn_stamp, cpu_stamp;
    int i;

    spin_lock_irqsave(&free_list_lock, flags);
    if ( likely(!list_empty(&free_list)) )
    {
        page = list_entry(free_list.next, struct pfn_info, list);
        list_del(&page->list);
        free_pfns--;
    }
    spin_unlock_irqrestore(&free_list_lock, flags);

    if ( unlikely(page == NULL) )
        return NULL;

    if ( (mask = page->u.cpu_mask) != 0 )
    {
        pfn_stamp = page->tlbflush_timestamp;
        for ( i = 0; (mask != 0) && (i < NR_CPUS); i++ )
        {
            if ( mask & (1<<i) )
            {
                cpu_stamp = tlbflush_time[i];
                if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
                    mask &= ~(1<<i);
            }
        }

        if ( unlikely(mask != 0) )
        {
            /* In IRQ ctxt, flushing is best-effort only, to avoid deadlock. */
            if ( likely(!in_irq()) )
                flush_tlb_mask(mask);
            else if ( unlikely(!try_flush_tlb_mask(mask)) )
                goto free_and_exit;
            perfc_incrc(need_flush_tlb_flush);
        }
    }

    page->u.domain = p;
    page->type_and_flags = 0;
    if ( p != NULL )
    {
        if ( unlikely(in_irq()) )
            BUG();
        wmb(); /* Domain pointer must be visible before updating refcnt. */
        spin_lock(&p->page_list_lock);
        if ( unlikely(p->tot_pages >= p->max_pages) )
        {
            spin_unlock(&p->page_list_lock);
            goto free_and_exit;
        }
        list_add_tail(&page->list, &p->page_list);
        p->tot_pages++;
        page->count_and_flags = PGC_allocated | 1;
        spin_unlock(&p->page_list_lock);
    }

    return page;

 free_and_exit:
    spin_lock_irqsave(&free_list_lock, flags);
    list_add(&page->list, &free_list);
    free_pfns++;
    spin_unlock_irqrestore(&free_list_lock, flags);
    return NULL;
}

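/*
 * Return a page frame to the free list (Xen-heap frames go back to the Xen
 * heap instead). Non-zombie frames are stamped with the current TLB-flush
 * clock so a later allocation knows which CPUs may hold stale mappings.
 */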
void free_domain_page(struct pfn_info *page)
{
    unsigned long flags;
    struct task_struct *p = page->u.domain;

    if ( unlikely(in_irq()) )
        BUG();

    if ( likely(!IS_XEN_HEAP_FRAME(page)) )
    {
        /*
         * No race with setting of zombie bit. If it wasn't set before the
         * last reference was dropped, then it can't be set now.
         */
        page->u.cpu_mask = 0;
        if ( !(page->count_and_flags & PGC_zombie) )
        {
            page->tlbflush_timestamp = tlbflush_clock;
            page->u.cpu_mask = 1 << p->processor;

            spin_lock(&p->page_list_lock);
            list_del(&page->list);
            p->tot_pages--;
            spin_unlock(&p->page_list_lock);
        }

        page->count_and_flags = 0;

        spin_lock_irqsave(&free_list_lock, flags);
        list_add(&page->list, &free_list);
        free_pfns++;
        spin_unlock_irqrestore(&free_list_lock, flags);
    }
    else
    {
        /*
         * No need for a TLB flush. Non-domain pages are always co-held by Xen,
         * and the Xen reference is not dropped until the domain is dead.
         * DOM0 may hold references, but it's trusted so no need to flush.
         */
        page->u.cpu_mask = 0;
        page->count_and_flags = 0;
        free_page((unsigned long)page_to_virt(page));
    }
}


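/*
 * Release all memory held by domain @p: drop the reference to its pagetable
 * base, zombify every page on its allocation list, and then drop the
 * remaining allocation and type references on each zombie page.
 */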
void free_all_dom_mem(struct task_struct *p)
{
    struct list_head *ent, zombies;
    struct pfn_info *page;
    unsigned long x, y;

    INIT_LIST_HEAD(&zombies);

    /* STEP 1. Drop the in-use reference to the page-table base. */
    put_page_and_type(&frame_table[pagetable_val(p->mm.pagetable) >>
                                  PAGE_SHIFT]);

    /* STEP 2. Zombify all pages on the domain's allocation list. */
    spin_lock(&p->page_list_lock);
    while ( (ent = p->page_list.next) != &p->page_list )
    {
        page = list_entry(ent, struct pfn_info, list);

        if ( unlikely(!get_page(page, p)) )
        {
            /*
             * Another CPU has dropped the last reference and is responsible 
             * for removing the page from this list. Wait for them to do so.
             */
            spin_unlock(&p->page_list_lock);
            while ( p->page_list.next == ent )
                barrier();
            spin_lock(&p->page_list_lock);
            continue;
        }

        set_bit(_PGC_zombie, &page->count_and_flags);

        list_del(&page->list);
        p->tot_pages--;

        list_add(&page->list, &zombies);
    }
    spin_unlock(&p->page_list_lock);

    /*
     * STEP 3. With the domain's list lock now released, we examine each zombie
     * page and drop references for guest-allocated and/or type-pinned pages.
     */
    while ( (ent = zombies.next) != &zombies )
    {
        page = list_entry(ent, struct pfn_info, list);

        list_del(&page->list);
        
        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
            put_page_and_type(page);

        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
            put_page(page);

        /*
         * Forcibly invalidate L2 tables at this point to break circular
         * 'linear page table' references. This is okay because MMU structures
         * are not shared across domains and this domain is now dead. Thus L2
         * tables are not in use so a non-zero count means circular reference.
         */
        y = page->type_and_flags;
        do {
            x = y;
            if ( likely((x & (PGT_type_mask|PGT_validated)) != 
                        (PGT_l2_page_table|PGT_validated)) )
                break;
            y = cmpxchg(&page->type_and_flags, x, x & ~PGT_validated);
            if ( likely(y == x) )
                free_page_type(page, PGT_l2_page_table);
        }
        while ( unlikely(y != x) );

        put_page(page);
    }
}


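/*
 * Allocate @kbytes of memory to domain @p, one page at a time, while always
 * leaving at least SLACK_DOMAIN_MEM_KILOBYTES on the free list. On failure
 * all pages already given to the domain are released again.
 */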
unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
{
    unsigned int alloc_pfns, nr_pages;

    nr_pages = kbytes >> (PAGE_SHIFT - 10);

    /* TEMPORARY: max_pages should be explicitly specified. */
    p->max_pages = nr_pages;

    for ( alloc_pfns = 0; alloc_pfns < nr_pages; alloc_pfns++ )
    {
        if ( unlikely(alloc_domain_page(p) == NULL) ||
             unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
                                   (PAGE_SHIFT-10))) )
        {
            free_all_dom_mem(p);
            return -1;
        }
    }

    p->tot_pages = nr_pages;

    return 0;
}
 

/* Release resources belonging to task @p. */
void release_task(struct task_struct *p)
{
    extern void destroy_event_channels(struct task_struct *);

    ASSERT(p->state == TASK_DYING);
    ASSERT(!p->has_cpu);

    printk("Releasing task %llu\n", p->domain);

    /*
     * This frees up blkdev rings and vbd-access lists. Totally safe since
     * blkdev ref counting actually uses the task_struct refcnt.
     */
    destroy_blkdev_info(p);

    /* Free all memory associated with this domain. */
    destroy_event_channels(p);
    free_page((unsigned long)p->mm.perdomain_pt);
    UNSHARE_PFN(virt_to_page(p->shared_info));
    free_all_dom_mem(p);

    kmem_cache_free(task_struct_cachep, p);
}


/*
 * final_setup_guestos is used for the final setup and launching of domains
 * other than domain 0, i.e. the domains built by the userspace domain
 * builder running in dom0.
 */
int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain)
{
    unsigned long phys_l2tab;
    int i;

    if ( test_bit(PF_CONSTRUCTED, &p->flags) )
        return -EINVAL;
    
    clear_bit(PF_DONEFPUINIT, &p->flags);
    if ( builddomain->ctxt.flags & ECF_I387_VALID )
        set_bit(PF_DONEFPUINIT, &p->flags);
    memcpy(&p->shared_info->execution_context,
           &builddomain->ctxt.i386_ctxt,
           sizeof(p->shared_info->execution_context));
    memcpy(&p->thread.i387,
           &builddomain->ctxt.i387_ctxt,
           sizeof(p->thread.i387));
    memcpy(p->thread.traps,
           &builddomain->ctxt.trap_ctxt,
           sizeof(p->thread.traps));
    SET_DEFAULT_FAST_TRAP(&p->thread);
    (void)set_fast_trap(p, builddomain->ctxt.fast_trap_idx);
    p->mm.ldt_base = builddomain->ctxt.ldt_base;
    p->mm.ldt_ents = builddomain->ctxt.ldt_ents;
    SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
    SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);
    if ( builddomain->ctxt.gdt_ents != 0 )
        (void)set_gdt(p,
                      builddomain->ctxt.gdt_frames,
                      builddomain->ctxt.gdt_ents);
    p->thread.ss1  = builddomain->ctxt.ring1_ss;
    p->thread.esp1 = builddomain->ctxt.ring1_esp;
    for ( i = 0; i < 8; i++ )
        (void)set_debugreg(p, i, builddomain->ctxt.debugreg[i]);
    p->event_selector    = builddomain->ctxt.event_callback_cs;
    p->event_address     = builddomain->ctxt.event_callback_eip;
    p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs;
    p->failsafe_address  = builddomain->ctxt.failsafe_callback_eip;
    
    phys_l2tab = builddomain->ctxt.pt_base;
    p->mm.pagetable = mk_pagetable(phys_l2tab);
    get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, 
                      PGT_l2_page_table);

    /* Set up the shared info structure. */
    update_dom_time(p->shared_info);

    /* Add virtual network interfaces and point to them in startinfo. */
    while ( builddomain->num_vifs-- > 0 )
        (void)create_net_vif(p->domain);

    set_bit(PF_CONSTRUCTED, &p->flags);
    
    return 0;
}

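/*
 * Hand out the machine page at *cur_addr, then step *cur_addr back to the
 * previous page on the owning domain's page list and decrement *index.
 * setup_guestos() uses this to carve pagetable pages from the top of the
 * dom0 allocation.
 */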
static unsigned long alloc_page_from_domain(unsigned long * cur_addr, 
    unsigned long * index)
{
    unsigned long ret = *cur_addr;
    struct list_head *ent = frame_table[ret >> PAGE_SHIFT].list.prev;
    *cur_addr = list_entry(ent, struct pfn_info, list) - frame_table;
    *cur_addr <<= PAGE_SHIFT;
    (*index)--;    
    return ret;
}

/*
 * setup_guestos is used solely for building domain 0. Other domains are
 * built by the userspace domain builder in dom0 and finalised by
 * final_setup_guestos.
 */
int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, 
                  unsigned int num_vifs,
                  char *phy_data_start, unsigned long data_len, 
		  char *cmdline, unsigned long initrd_len)
{
    struct list_head *list_ent;
    char *src, *vsrc, *dst, *data_start;
    int i;
    domid_t dom = p->domain;
    unsigned long phys_l1tab, phys_l2tab;
    unsigned long cur_address, alloc_address;
    unsigned long virt_load_address, virt_stack_address;
    start_info_t  *virt_startinfo_address;
    unsigned long count;
    unsigned long alloc_index;
    l2_pgentry_t *l2tab, *l2start;
    l1_pgentry_t *l1tab = NULL, *l1start = NULL;
    struct pfn_info *page = NULL;

    extern void ide_probe_devices(xen_disk_info_t *);
    extern void scsi_probe_devices(xen_disk_info_t *);
    xen_disk_info_t xdi;
    xen_disk_t *xd;

    /* Sanity! */
    if ( p->domain != 0 ) BUG();
    if ( test_bit(PF_CONSTRUCTED, &p->flags) ) BUG();

    /*
     * This is all a bit grim. We've moved the modules to the "safe" physical
     * memory region above MAP_DIRECTMAP_ADDRESS (48MB). Later in this
     * routine, we're going to copy them down into the region that's actually
     * been allocated to domain 0. This is highly likely to be overlapping, so
     * we use a forward copy.
     *
     * MAP_DIRECTMAP_ADDRESS should be safe. The worst case is a machine with
     * 4GB and lots of network/disk cards that allocate loads of buffers.
     * We'll have to revisit this if we ever support PAE (64GB).
     */

    data_start = map_domain_mem((unsigned long)phy_data_start);

    if ( strncmp(data_start, "XenoGues", 8) )
    {
        printk("DOM%llu: Invalid guest OS image\n", dom);
        return -1;
    }

    virt_load_address = *(unsigned long *)(data_start + 8);
    if ( (virt_load_address & (PAGE_SIZE-1)) )
    {
        printk("DOM%llu: Guest OS load address not page-aligned (%08lx)\n",
               dom, virt_load_address);
        return -1;
    }

    if ( alloc_new_dom_mem(p, params->memory_kb) )
    {
        printk("DOM%llu: Not enough memory --- reduce dom0_mem ??\n", dom);
        return -ENOMEM;
    }

    alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) -
        frame_table;
    alloc_address <<= PAGE_SHIFT;
    alloc_index = p->tot_pages;

    if ( data_len > (params->memory_kb << 9) )
    {
        printk("DOM%llu: Guest OS image is too large\n"
               "       (%luMB is greater than %uMB limit for a\n"
               "        %uMB address space)\n",
               dom, data_len>>20,
               (params->memory_kb)>>11,
               (params->memory_kb)>>10);
        free_all_dom_mem(p);
        return -1;
    }

    printk("DOM%llu: Guest OS virtual load address is %08lx\n", dom,
           virt_load_address);
    
    SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES);
    SET_GDT_ADDRESS(p, DEFAULT_GDT_ADDRESS);

    /*
     * We're basically forcing default RPLs to 1, so that our "what privilege
     * level are we returning to?" logic works.
     */
    p->failsafe_selector = FLAT_RING1_CS;
    p->event_selector    = FLAT_RING1_CS;
    p->thread.ss1        = FLAT_RING1_DS;
    for ( i = 0; i < 256; i++ ) 
        p->thread.traps[i].cs = FLAT_RING1_CS;

    /*
     * WARNING: The new domain must have its 'processor' field
     * filled in by now !!
     */
    phys_l2tab = alloc_page_from_domain(&alloc_address, &alloc_index);
    l2start = l2tab = map_domain_mem(phys_l2tab);
    memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
    l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
    l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
        mk_l2_pgentry(phys_l2tab | __PAGE_HYPERVISOR);
    memset(l2tab, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t));
    p->mm.pagetable = mk_pagetable(phys_l2tab);

    l2tab += l2_table_offset(virt_load_address);
    cur_address = list_entry(p->page_list.next, struct pfn_info, list) -
        frame_table;
    cur_address <<= PAGE_SHIFT;
    for ( count = 0; count < p->tot_pages; count++ )
    {
        if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
        {
            if ( l1tab != NULL ) unmap_domain_mem(l1start);
            phys_l1tab = alloc_page_from_domain(&alloc_address, &alloc_index);
            *l2tab++ = mk_l2_pgentry(phys_l1tab|L2_PROT);
            l1start = l1tab = map_domain_mem(phys_l1tab);
            clear_page(l1tab);
            l1tab += l1_table_offset(
                virt_load_address + (count << PAGE_SHIFT));
        }
        *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
        
        page = &frame_table[cur_address >> PAGE_SHIFT];
        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
        if ( !get_page_and_type(page, p, PGT_writeable_page) )
            BUG();
        /* Set up the MPT entry. */
        machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;

        list_ent = frame_table[cur_address >> PAGE_SHIFT].list.next;
        cur_address = list_entry(list_ent, struct pfn_info, list) -
            frame_table;
        cur_address <<= PAGE_SHIFT;
    }
    unmap_domain_mem(l1start);

    /* Pages that are part of page tables must be read-only. */
    l2tab = l2start + l2_table_offset(virt_load_address + 
        (alloc_index << PAGE_SHIFT));
    l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab));
    l1tab += l1_table_offset(virt_load_address + (alloc_index << PAGE_SHIFT));
    l2tab++;
    for ( count = alloc_index; count < p->tot_pages; count++ ) 
    {
        *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
        page = frame_table + l1_pgentry_to_pagenr(*l1tab);
        page->type_and_flags &= ~PGT_type_mask;
        page->type_and_flags |= PGT_l1_page_table;
        get_page(page, p); /* an extra ref because of readable mapping */
        l1tab++;
        if ( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
        {
            unmap_domain_mem(l1start);
            l1start = l1tab = map_domain_mem(l2_pgentry_to_phys(*l2tab));
            l2tab++;
        }
    }
    /* Rewrite last L1 page to be a L2 page. */
    page->type_and_flags &= ~PGT_type_mask;
    page->type_and_flags |= PGT_l2_page_table;
    /* Get another ref to L2 page so that it can be pinned. */
    if ( !get_page_and_type(page, p, PGT_l2_page_table) )
        BUG();
    set_bit(_PGC_guest_pinned, &page->count_and_flags);
    unmap_domain_mem(l1start);

    /* Set up shared info area. */
    update_dom_time(p->shared_info);
    p->shared_info->domain_time = 0;

    virt_startinfo_address = (start_info_t *)
        (virt_load_address + ((alloc_index - 1) << PAGE_SHIFT));
    virt_stack_address  = (unsigned long)virt_startinfo_address;
    
    unmap_domain_mem(l2start);

    /* Install the new page tables. */
    __cli();
    write_cr3_counted(pagetable_val(p->mm.pagetable));

    /* Copy the guest OS image. */    
    src  = (char *)(phy_data_start + 12);
    vsrc = (char *)(data_start + 12); /* data_start invalid after first page */
    dst  = (char *)virt_load_address;
    while ( src < (phy_data_start+data_len) )
    {
        *dst++ = *vsrc++;
        src++;
        if ( (((unsigned long)src) & (PAGE_SIZE-1)) == 0 )
        {
            unmap_domain_mem(vsrc-1);
            vsrc = map_domain_mem((unsigned long)src);
        }
    }
    unmap_domain_mem(vsrc);
    
    /* Set up start info area. */
    memset(virt_startinfo_address, 0, sizeof(*virt_startinfo_address));
    virt_startinfo_address->nr_pages = p->tot_pages;
    virt_startinfo_address->shared_info = virt_to_phys(p->shared_info);
    virt_startinfo_address->pt_base = virt_load_address + 
        ((p->tot_pages - 1) << PAGE_SHIFT); 

    virt_startinfo_address->flags  = 0;
    if ( IS_PRIV(p) )
        virt_startinfo_address->flags |= SIF_PRIVILEGED;
    if ( p->domain == 0 )
        virt_startinfo_address->flags |= SIF_INITDOMAIN;

    if ( initrd_len )
    {
        virt_startinfo_address->mod_start = (unsigned long)dst - initrd_len;
        virt_startinfo_address->mod_len   = initrd_len;
        printk("Initrd len 0x%lx, start at 0x%08lx\n",
               virt_startinfo_address->mod_len,
               virt_startinfo_address->mod_start);
    }

    /* Add virtual network interfaces and point to them in startinfo. */
    while ( num_vifs-- > 0 )
        (void)create_net_vif(dom);

    dst = virt_startinfo_address->cmd_line;
    if ( cmdline != NULL )
    {
        for ( i = 0; i < 255; i++ )
        {
            if ( cmdline[i] == '\0' ) break;
            *dst++ = cmdline[i];
        }
    }
    *dst = '\0';

    /* HACK: Give up the VGA console iff the Xenolinux DOM0 wants it. */
    if ( (cmdline != NULL) && (strstr(cmdline, "tty0") != NULL) )
        vgacon_enabled = 0;

    /* Reinstate the caller's page tables. */
    write_cr3_counted(pagetable_val(current->mm.pagetable));
    __sti();

    /* DOM0 gets access to all real block devices. */
#define MAX_REAL_DISKS 256
    xd = kmalloc(MAX_REAL_DISKS * sizeof(xen_disk_t), GFP_KERNEL);
    xdi.max   = MAX_REAL_DISKS;
    xdi.count = 0;
    xdi.disks = xd;
    ide_probe_devices(&xdi);
    scsi_probe_devices(&xdi);
    for ( i = 0; i < xdi.count; i++ )
    {
        xen_extent_t e;
        e.device       = xd[i].device;
        e.start_sector = 0;
        e.nr_sectors   = xd[i].capacity;
        if ( (__vbd_create(p, xd[i].device, VBD_MODE_R|VBD_MODE_W, 
                           xd[i].info) != 0) ||
             (__vbd_grow(p, xd[i].device, &e) != 0) )
            BUG();
    }
    kfree(xd);

    set_bit(PF_CONSTRUCTED, &p->flags);

    new_thread(p, 
               (unsigned long)virt_load_address, 
               (unsigned long)virt_stack_address, 
               (unsigned long)virt_startinfo_address);

    return 0;
}


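/* Domain-subsystem initialisation hook; currently it only prints a banner. */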
void __init domain_init(void)
{
    printk("Initialising domains\n");
}