author | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2004-05-07 14:53:28 +0000
---|---|---
committer | kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk> | 2004-05-07 14:53:28 +0000
commit | 1de448f4c54eac94a966d65e72b15bcbef3a7e5d (patch) |
tree | cc2a452db9507208580071b8676288d9b1dde5ea |
parent | 747a8d04495070f12d625e2047b07eb3967ca9b8 (diff) |
bitkeeper revision 1.891.1.5 (409ba2e8A6F60eP06BqyZUGapsn8XA)
The network interface for the new I/O model is now complete.
34 files changed, 1866 insertions(+), 349 deletions(-)
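One of the central changes in the diff below replaces the block interface's buffer_and_sects segment descriptors with a packed frame_and_sects encoding (bits [2:0] = last_sect, bits [5:3] = first_sect, bits 12 and up = machine frame). A minimal standalone C sketch of that encoding follows; the two macros are copied from the new blkif.h, while PAGE_SHIFT, the sample machine address, and the sector range are invented purely for illustration:

    /*
     * Sketch of the frame_and_sects descriptor layout introduced in
     * blkif.h below: [2:0]=last_sect, [5:3]=first_sect, [12:]=frame.
     * The blkif_* macros are from the patch; everything else is a
     * made-up standalone demo.
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

    #define blkif_first_sect(_fas) (((_fas) >> 3) & 7)
    #define blkif_last_sect(_fas)  ((_fas) & 7)

    int main(void)
    {
        unsigned long buffer_ma = 0x12345000UL; /* page-aligned machine addr */
        unsigned int  fsect = 2, lsect = 5;     /* sectors 2..5 of the frame */

        /* Encode, as the frontend does in blkif_queue_request(). */
        unsigned long fas = (buffer_ma & PAGE_MASK) | (fsect << 3) | lsect;

        /* Decode, as the backend does in dispatch_rw_block_io(). */
        unsigned long buffer   = (fas & PAGE_MASK) |
                                 (blkif_first_sect(fas) << 9);
        long          nr_sects = blkif_last_sect(fas) -
                                 blkif_first_sect(fas) + 1;

        /* Prints: buffer=0x12345400 nr_sects=4 */
        printf("buffer=%#lx nr_sects=%ld\n", buffer, nr_sects);
        return 0;
    }

Note how a whole-page probe buffer is simply first_sect == 0, last_sect == 7, which is why the frontend's probe request changes to "virt_to_machine(buf) | 7" in the diff.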
@@ -107,6 +107,7 @@ 4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h 4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py 4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py +409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py 40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c 4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py 4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend @@ -735,6 +736,7 @@ 3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h 3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h 3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h +409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h 3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h 3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h 3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h @@ -762,6 +764,7 @@ 3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c 3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c 3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c +409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c 3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c 3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c 407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse diff --git a/tools/examples/xc_dom_create.py b/tools/examples/xc_dom_create.py index 22479a9d14..19bb2ac9df 100755 --- a/tools/examples/xc_dom_create.py +++ b/tools/examples/xc_dom_create.py @@ -333,7 +333,18 @@ def make_domain(): xc.domain_destroy ( dom=id ) sys.exit() - if not new_io_world: + if new_io_world: + cmsg = 'new_network_interface(dom='+str(id)+')' + xend_response = xenctl.utils.xend_control_message(cmsg) + if not xend_response['success']: + print "Error creating network interface" + print "Error type: " + xend_response['error_type'] + if xend_response['error_type'] == 'exception': + print "Exception type: " + xend_response['exception_type'] + print "Exception val: " + xend_response['exception_value'] + xc.domain_destroy ( dom=id ) + sys.exit() + else: # setup virtual firewall rules for all aliases for ip in vfr_ipaddr: xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip ) diff --git a/tools/xenctl/lib/utils.py b/tools/xenctl/lib/utils.py index 3f0914f73f..11aadb4f08 100644 --- a/tools/xenctl/lib/utils.py +++ b/tools/xenctl/lib/utils.py @@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'): return m.group(1) return None -def get_current_ipgw(dev='eth0'): - """Return a string containing the IP gateway for the given - network interface (default 'eth0'). - """ +def get_current_ipgw(): + """Return a string containing the default IP gateway.""" fd = os.popen( '/sbin/route -n' ) lines = fd.readlines() for line in lines: - m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' + - '\s+\S+\s+\S*G.*' + dev + '.*', line ) + m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' + + '\s+0.0.0.0+\s+\S*G.*', line ) if m: return m.group(1) return None diff --git a/tools/xend/lib/domain_controller.h b/tools/xend/lib/domain_controller.h index d5c397fe96..566967dc38 100644 --- a/tools/xend/lib/domain_controller.h +++ b/tools/xend/lib/domain_controller.h @@ -468,7 +468,6 @@ typedef struct { unsigned int evtchn; /* Event channel for notifications. */ unsigned long tx_shmem_frame; /* Page cont. tx shared comms window. 
*/ unsigned long rx_shmem_frame; /* Page cont. rx shared comms window. */ - unsigned long shmem_frame; /* OUT */ unsigned int status; } netif_be_connect_t; diff --git a/tools/xend/lib/main.py b/tools/xend/lib/main.py index 7b5adbab83..0056783d7f 100755 --- a/tools/xend/lib/main.py +++ b/tools/xend/lib/main.py @@ -5,7 +5,7 @@ ########################################################### import errno, re, os, pwd, select, signal, socket, struct, sys, time -import xend.blkif, xend.console, xend.manager, xend.utils, Xc +import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc # The following parameters could be placed in a configuration file. @@ -19,6 +19,8 @@ UNIX_SOCK = 'management_sock' # relative to CONTROL_DIR CMSG_CONSOLE = 0 CMSG_BLKIF_BE = 1 CMSG_BLKIF_FE = 2 +CMSG_NETIF_BE = 3 +CMSG_NETIF_FE = 4 def port_from_dom(dom): @@ -162,6 +164,10 @@ def daemon_loop(): if xend.blkif.interface.list.has_key(idx): blk_if = xend.blkif.interface.list[idx] + net_if = False + if xend.netif.interface.list.has_key(idx): + net_if = xend.netif.interface.list[idx] + # If we pick up a disconnect notification then we do any necessary # cleanup. if type == notifier.EXCEPTION: @@ -175,6 +181,9 @@ def daemon_loop(): if blk_if: blk_if.destroy() del blk_if + if net_if: + net_if.destroy() + del net_if continue # Process incoming requests. @@ -188,6 +197,10 @@ def daemon_loop(): blk_if.ctrlif_rx_req(port, msg) elif type == CMSG_BLKIF_BE and port == dom0_port: xend.blkif.backend_rx_req(port, msg) + elif type == CMSG_NETIF_FE and net_if: + net_if.ctrlif_rx_req(port, msg) + elif type == CMSG_NETIF_BE and port == dom0_port: + xend.netif.backend_rx_req(port, msg) else: port.write_response(msg) @@ -198,6 +211,8 @@ def daemon_loop(): type = (msg.get_header())['type'] if type == CMSG_BLKIF_BE and port == dom0_port: xend.blkif.backend_rx_rsp(port, msg) + elif type == CMSG_NETIF_BE and port == dom0_port: + xend.netif.backend_rx_rsp(port, msg) # Send console data. if con_if and con_if.ctrlif_transmit_work(port): @@ -207,10 +222,18 @@ def daemon_loop(): if blk_if and blk_if.ctrlif_transmit_work(port): work_done = True + # Send netif messages. + if net_if and net_if.ctrlif_transmit_work(port): + work_done = True + # Back-end block-device work. if port == dom0_port and xend.blkif.backend_do_work(port): work_done = True + # Back-end network-device work. + if port == dom0_port and xend.netif.backend_do_work(port): + work_done = True + # Finally, notify the remote end of any work that we did. if work_done: port.notify() diff --git a/tools/xend/lib/manager.py b/tools/xend/lib/manager.py index ea7398cd4c..2f15683d66 100644 --- a/tools/xend/lib/manager.py +++ b/tools/xend/lib/manager.py @@ -4,7 +4,7 @@ ## Copyright (c) 2004, K A Fraser (University of Cambridge) ############################################################# -import xend.blkif, xend.console, xend.main, xend.utils +import xend.blkif, xend.netif, xend.console, xend.main, xend.utils ## @@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, pdev, start_sect, nr_sect, readonly): # Response is deferred until back-end driver sends acknowledgement. return None + + +## +## new_network_interface: +## Create a new network interface for the specified domain @dom. +## +def new_network_interface(dom, handle=-1): + # By default we create an interface with handle zero. + if handle < 0: + handle = 0 + + # We only support one interface per domain, which must have handle zero. 
+ if handle != 0: + response = { 'success': False } + response['error_type'] = 'Bad handle %d (only handle 0 ' + \ + 'is supported)' % handle + return response + + # Find local event-channel port associated with the specified domain. + port = xend.main.port_from_dom(dom) + if not port: + response = { 'success': False } + response['error_type'] = 'Unknown domain %d' % dom + return response + + # The interface must not already exist. + if xend.netif.interface.list.has_key(port.local_port): + response = { 'success': False } + response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \ + 'exists' % (dom, handle) + return response + + # Create the new interface. Initially no virtual devices are attached. + xend.netif.interface(dom, port.local_port) + + # Response is deferred until back-end driver sends acknowledgement. + return None diff --git a/tools/xend/lib/netif.py b/tools/xend/lib/netif.py new file mode 100644 index 0000000000..11756c5e56 --- /dev/null +++ b/tools/xend/lib/netif.py @@ -0,0 +1,144 @@ + +################################################################### +## xend/netif.py -- Network-interface management functions for Xend +## Copyright (c) 2004, K A Fraser (University of Cambridge) +################################################################### + +import errno, random, re, os, select, signal, socket, struct, sys +import xend.main, xend.console, xend.manager, xend.utils, Xc + +CMSG_NETIF_BE = 3 +CMSG_NETIF_FE = 4 +CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED = 0 +CMSG_NETIF_FE_DRIVER_STATUS_CHANGED = 32 +CMSG_NETIF_FE_INTERFACE_CONNECT = 33 +CMSG_NETIF_FE_INTERFACE_DISCONNECT = 34 +CMSG_NETIF_BE_CREATE = 0 +CMSG_NETIF_BE_DESTROY = 1 +CMSG_NETIF_BE_CONNECT = 2 +CMSG_NETIF_BE_DISCONNECT = 3 + +pendmsg = None +pendaddr = None + +def backend_tx_req(msg): + port = xend.main.dom0_port + if port.space_to_write_request(): + port.write_request(msg) + port.notify() + else: + xend.netif.pendmsg = msg + +def backend_rx_req(port, msg): + port.write_response(msg) + +def backend_rx_rsp(port, msg): + subtype = (msg.get_header())['subtype'] + print "Received netif-be response, subtype %d" % subtype + if subtype == CMSG_NETIF_BE_CREATE: + rsp = { 'success': True } + xend.main.send_management_response(rsp, xend.netif.pendaddr) + elif subtype == CMSG_NETIF_BE_CONNECT: + (dom,hnd,evtchn,tx_frame,rx_frame,st) = \ + struct.unpack("QIILLI", msg.get_payload()) + netif = interface.list[xend.main.port_from_dom(dom).local_port] + msg = xend.utils.message(CMSG_NETIF_FE, \ + CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0) + msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \ + netif.evtchn['port2'], \ + netif.mac[0],netif.mac[1], \ + netif.mac[2],netif.mac[3], \ + netif.mac[4],netif.mac[5], \ + 0,0)) + netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg) + +def backend_do_work(port): + global pendmsg + if pendmsg and port.space_to_write_request(): + port.write_request(pendmsg) + pendmsg = None + return True + return False + + +class interface: + + # Dictionary of all network-device interfaces. + list = {} + + + # NB. 'key' is an opaque value that has no meaning in this class. + def __init__(self, dom, key): + self.dom = dom + self.key = key + self.pendmsg = None + + # VIFs get a random MAC address with a "special" vendor id. + # + # NB. The vendor is currently an "obsolete" one that used to belong + # to DEC (AA-00-00). Using it is probably a bit rude :-) + # + # NB2. The first bit of the first random octet is set to zero for + # all dynamic MAC addresses. 
This may allow us to manually specify + # MAC addresses for some VIFs with no fear of clashes. + self.mac = [ 0xaa, 0x00, 0x00 ] + self.mac.append(int(random.random()*128)) + self.mac.append(int(random.random()*256)) + self.mac.append(int(random.random()*256)) + + interface.list[key] = self + msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0) + msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \ + self.mac[0],self.mac[1], \ + self.mac[2],self.mac[3], \ + self.mac[4],self.mac[5], \ + 0,0,0)) + xend.netif.pendaddr = xend.main.mgmt_req_addr + backend_tx_req(msg) + + + # Completely destroy this interface. + def destroy(self): + del interface.list[self.key] + msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0) + msg.append_payload(struct.pack("QII",self.dom,0,0)) + backend_tx_req(msg) + + + # The parameter @port is the control-interface event channel. This method + # returns True if messages were written to the control interface. + def ctrlif_transmit_work(self, port): + if self.pendmsg and port.space_to_write_request(): + port.write_request(self.pendmsg) + self.pendmsg = None + return True + return False + + def ctrlif_tx_req(self, port, msg): + if port.space_to_write_request(): + port.write_request(msg) + port.notify() + else: + self.pendmsg = msg + + def ctrlif_rx_req(self, port, msg): + port.write_response(msg) + subtype = (msg.get_header())['subtype'] + if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED: + msg = xend.utils.message(CMSG_NETIF_FE, \ + CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0) + msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \ + self.mac[1],self.mac[2], \ + self.mac[3],self.mac[4], \ + self.mac[5],0,0)) + self.ctrlif_tx_req(port, msg) + elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT: + (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload()) + xc = Xc.new() + self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom) + msg = xend.utils.message(CMSG_NETIF_BE, \ + CMSG_NETIF_BE_CONNECT, 0) + msg.append_payload(struct.pack("QIILLI",self.dom,0, \ + self.evtchn['port1'],tx_frame, \ + rx_frame,0)) + backend_tx_req(msg) diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c index 7d596026f9..79d0bb1df1 100644 --- a/xen/common/dom_mem_ops.c +++ b/xen/common/dom_mem_ops.c @@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op) { /* Leave some slack pages; e.g., for the network. */ if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> - (PAGE_SHIFT-10))) ) + (PAGE_SHIFT-10))) ) + { + DPRINTK("Not enough slack: %u %u\n", + free_pfns, + SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10)); break; + } /* NB. 'alloc_domain_page' does limit checking on pages per domain. */ if ( unlikely((page = alloc_domain_page(p)) == NULL) ) + { + DPRINTK("Could not allocate a frame\n"); break; - + } + /* Inform the domain of the new page's machine address. 
*/ mpfn = (unsigned long)(page - frame_table); copy_to_user(op.pages, &mpfn, sizeof(mpfn)); diff --git a/xen/common/domain.c b/xen/common/domain.c index a9c40ae98f..1b8759e912 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -334,6 +334,8 @@ struct pfn_info *alloc_domain_page(struct task_struct *p) spin_lock(&p->page_list_lock); if ( unlikely(p->tot_pages >= p->max_pages) ) { + DPRINTK("Over-allocation for domain %llu: %u >= %u\n", + p->domain, p->tot_pages, p->max_pages); spin_unlock(&p->page_list_lock); goto free_and_exit; } @@ -884,7 +886,7 @@ int construct_dom0(struct task_struct *p, page->type_and_flags = 0; page->count_and_flags = PGC_allocated | 1; list_add_tail(&page->list, &p->page_list); - p->tot_pages++; + p->tot_pages++; p->max_pages++; } mpt_alloc = (vpt_start - v_start) + alloc_start; diff --git a/xen/common/kernel.c b/xen/common/kernel.c index 7f814391cf..0d5fa023a1 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -105,7 +105,6 @@ static struct { void cmain(unsigned long magic, multiboot_info_t *mbi) { struct task_struct *new_dom; - dom0_createdomain_t dom0_params; unsigned long max_page; unsigned char *cmdline; module_t *mod = (module_t *)__va(mbi->mods_addr); @@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboot_info_t *mbi) task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task; /* Create initial domain 0. */ - dom0_params.memory_kb = opt_dom0_mem; new_dom = do_createdomain(0, 0); if ( new_dom == NULL ) panic("Error creating domain 0\n"); diff --git a/xen/common/memory.c b/xen/common/memory.c index e4d0590a57..5acfae8482 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -940,17 +940,25 @@ static int do_extended_command(unsigned long ptr, unsigned long val) } break; + /* XXX This function is racey! 
*/ case MMUEXT_REASSIGN_PAGE: - if ( !IS_PRIV(current) ) + if ( unlikely(!IS_PRIV(current)) ) { MEM_LOG("Dom %llu has no privilege to reassign page ownership", current->domain); okay = 0; } - else if ( percpu_info[cpu].gps != NULL ) + else if ( likely(percpu_info[cpu].gps != NULL) ) { + current->tot_pages--; + percpu_info[cpu].gps->tot_pages++; page->u.domain = percpu_info[cpu].gps; } + else + { + MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn); + okay = 0; + } break; case MMUEXT_RESET_SUBJECTDOM: diff --git a/xenolinux-2.4.26-sparse/arch/xen/config.in b/xenolinux-2.4.26-sparse/arch/xen/config.in index 16fa5e66d4..7f961d8521 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/config.in +++ b/xenolinux-2.4.26-sparse/arch/xen/config.in @@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then bool 'HIGHMEM I/O support' CONFIG_HIGHIO fi +define_int CONFIG_FORCE_MAX_ZONEORDER 12 + #bool 'Symmetric multi-processing support' CONFIG_SMP #if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then # define_bool CONFIG_HAVE_DEC_LOCK y diff --git a/xenolinux-2.4.26-sparse/arch/xen/defconfig b/xenolinux-2.4.26-sparse/arch/xen/defconfig index eaa9171b1f..013e732c3f 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig @@ -50,6 +50,7 @@ CONFIG_X86_TSC=y CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set +CONFIG_FORCE_MAX_ZONEORDER=12 # # General setup @@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y # Network testing # # CONFIG_NET_PKTGEN is not set +CONFIG_NETDEVICES=y # # Block devices diff --git a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev index 41b05aaaa7..3be5b50bfa 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev +++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev @@ -51,6 +51,7 @@ CONFIG_X86_TSC=y CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set +CONFIG_FORCE_MAX_ZONEORDER=12 # # General setup diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h index e6004b4a8e..e80435fbbb 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h @@ -10,6 +10,7 @@ #include <linux/rbtree.h> #include <linux/interrupt.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <asm/ctrl_if.h> #include <asm/io.h> #include "../blkif.h" diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c index 0746ecfab0..0b26224651 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c @@ -74,7 +74,8 @@ void blkif_ctrlif_init(void) ctrl_msg_t cmsg; blkif_be_driver_status_changed_t st; - (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx); + (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); /* Send a driver-UP notification to the domain controller. 
*/ cmsg.type = CMSG_BLKIF_BE; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c index 9acbac35ab..14a6ab324d 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c @@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *create) unsigned int handle = create->blkif_handle; blkif_t **pblkif, *blkif; - if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL ) + if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) { DPRINTK("Could not create blkif: out of memory\n"); create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c index 4b11ad9a8e..eb3e32c75f 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c @@ -24,17 +24,15 @@ #define MAX_PENDING_REQS 64 #define BATCH_PER_DOMAIN 16 -static struct vm_struct *mmap_vma; -#define MMAP_PAGES_PER_SEGMENT \ - ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1) +static unsigned long mmap_vstart; #define MMAP_PAGES_PER_REQUEST \ - (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT) + (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) #define MMAP_PAGES \ (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) -#define MMAP_VADDR(_req,_seg) \ - ((unsigned long)mmap_vma->addr + \ +#define MMAP_VADDR(_req,_seg) \ + (mmap_vstart + \ ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE)) + ((_seg) * PAGE_SIZE)) /* * Each outstanding request that we've passed to the lower device layers has a @@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW); for ( i = 0; i < req->nr_segments; i++ ) { - if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) ) + /* Make sure the buffer is page-sized. */ + if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) || + (blkif_last_sect(req->frame_and_sects[i]) != 7) ) goto bad_descriptor; rc = direct_remap_area_pages(&init_mm, MMAP_VADDR(pending_idx, i), - req->buffer_and_sects[i] & PAGE_MASK, + req->frame_and_sects[i] & PAGE_MASK, PAGE_SIZE, prot, blkif->domid); if ( rc != 0 ) goto bad_descriptor; @@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); struct buffer_head *bh; int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; - unsigned short nr_sects; - unsigned long buffer; + short nr_sects; + unsigned long buffer, fas; int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; pending_req_t *pending_req; pgprot_t prot; /* We map virtual scatter/gather segments to physical segments. */ int new_segs, nr_psegs = 0; - phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2]; + phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1]; /* Check that number of segments is sane. 
*/ if ( unlikely(req->nr_segments == 0) || @@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) */ for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects ) { - buffer = req->buffer_and_sects[i] & ~0x1FF; - nr_sects = req->buffer_and_sects[i] & 0x1FF; + fas = req->frame_and_sects[i]; + buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); + nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; - if ( unlikely(nr_sects == 0) ) - continue; - - if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) ) - { - DPRINTK("Too many sectors in segment\n"); + if ( nr_sects <= 0 ) goto bad_descriptor; - } phys_seg[nr_psegs].dev = req->device; phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects; @@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) } nr_psegs += new_segs; - ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2); + ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1)); } /* Nonsensical zero-sized request? */ @@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) for ( i = 0; i < nr_psegs; i++ ) { - unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) + - (phys_seg[i].nr_sects << 9) + - (PAGE_SIZE - 1)) & PAGE_MASK; int rc = direct_remap_area_pages(&init_mm, MMAP_VADDR(pending_idx, i), phys_seg[i].buffer & PAGE_MASK, - sz, prot, blkif->domid); + PAGE_SIZE, prot, blkif->domid); if ( rc != 0 ) { DPRINTK("invalid buffer\n"); @@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) MMAP_PAGES_PER_REQUEST * PAGE_SIZE); goto bad_descriptor; } + phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = + phys_seg[i].buffer >> PAGE_SHIFT; } pending_req = &pending_reqs[pending_idx]; @@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) bh->b_rsector = (unsigned long)phys_seg[i].sector_number; bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + (phys_seg[i].buffer & ~PAGE_MASK); +// bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); bh->b_end_io = end_block_io_op; bh->b_private = pending_req; @@ -456,13 +451,13 @@ static int __init init_module(void) { int i; + if ( !(start_info.flags & SIF_INITDOMAIN) ) + return 0; + blkif_interface_init(); - if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n"); - return -ENOMEM; - } + if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) + BUG(); pending_cons = 0; pending_prod = MAX_PENDING_REQS; @@ -484,6 +479,7 @@ static int __init init_module(void) static void cleanup_module(void) { + BUG(); } module_init(init_module); diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c index 19b0b3015d..bb5b6ea743 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c @@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *create) } } - if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) ) + if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) ) { DPRINTK("vbd_create: out of memory\n"); create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; @@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow) } if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t), - GFP_ATOMIC)) == NULL) ) + GFP_KERNEL)) == NULL) ) { DPRINTK("vbd_grow: out of 
memory\n"); grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h index 1938f68f8e..0a90744c59 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h @@ -26,19 +26,22 @@ */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 -#define BLKIF_MAX_SECTORS_PER_SEGMENT 16 - typedef struct { u8 operation; /* BLKIF_OP_??? */ u8 nr_segments; /* number of segments */ blkif_vdev_t device; /* only for read/write requests */ unsigned long id; /* private guest value, echoed in resp */ blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */ - /* Least 9 bits is 'nr_sects'. High 23 bits is the address. */ - /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */ - unsigned long buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame. */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + /* @frame: machine page frame number. */ + unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; } blkif_request_t; +#define blkif_first_sect(_fas) (((_fas)>>3)&7) +#define blkif_last_sect(_fas) ((_fas)&7) + typedef struct { unsigned long id; /* copied from request */ u8 operation; /* copied from request */ @@ -79,8 +82,8 @@ typedef struct { * @device == unused (zero) * @id == any value (echoed in response message) * @sector_num == unused (zero) - * @buffer_and_sects == list of page-aligned, page-sized buffers. - * (i.e., nr_sects == 8). + * @frame_and_sects == list of page-sized buffers. + * (i.e., @first_sect == 0, @last_sect == 7). * * The response is a list of vdisk_t elements copied into the out-of-band * probe buffer. On success the response status field contains the number diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c index 29cc01d087..63f1aeea26 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c @@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linux/ide.h */ static unsigned int blkif_state = BLKIF_STATE_CLOSED; static unsigned int blkif_evtchn, blkif_irq; -static struct tq_struct blkif_statechange_tq; - static int blkif_control_rsp_valid; static blkif_response_t blkif_control_rsp; @@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned long id, struct gendisk *gd; blkif_request_t *req; struct buffer_head *bh; + unsigned int fsect, lsect; - if ( unlikely(nr_sectors >= (1<<9)) ) - BUG(); + fsect = (buffer_ma & ~PAGE_MASK) >> 9; + lsect = fsect + nr_sectors - 1; + + /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. 
*/ if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) BUG(); + if ( lsect > 7 ) + BUG(); + + buffer_ma &= PAGE_MASK; if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) return 1; @@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned long id, bh = (struct buffer_head *)id; bh->b_reqnext = (struct buffer_head *)req->id; req->id = id; - req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors; - if ( ++req->nr_segments < MAX_BLK_SEGS ) + req->frame_and_sects[req->nr_segments] = + buffer_ma | (fsect<<3) | lsect; + if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) sg_next_sect += nr_sectors; else DISABLE_SCATTERGATHER(); @@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned long id, req->sector_number = (blkif_sector_t)sector_number; req->device = device; req->nr_segments = 1; - req->buffer_and_sects[0] = buffer_ma | nr_sectors; + req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect; req_prod++; return 0; @@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp) } -static void blkif_bringup_phase1(void *unused) +static void blkif_status_change(blkif_fe_interface_status_changed_t *status) { ctrl_msg_t cmsg; blkif_fe_interface_connect_t up; - /* Move from CLOSED to DISCONNECTED state. */ - blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); - blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; - blkif_state = BLKIF_STATE_DISCONNECTED; - - /* Construct an interface-CONNECT message for the domain controller. */ - cmsg.type = CMSG_BLKIF_FE; - cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; - cmsg.length = sizeof(blkif_fe_interface_connect_t); - up.handle = 0; - up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; - memcpy(cmsg.msg, &up, sizeof(up)); - - /* Tell the controller to bring up the interface. */ - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -static void blkif_bringup_phase2(void *unused) -{ - blkif_irq = bind_evtchn_to_irq(blkif_evtchn); - (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); - - /* Probe for discs that are attached to the interface. */ - xlvbd_init(); - - blkif_state = BLKIF_STATE_CONNECTED; - - /* Kick pending requests. */ - spin_lock_irq(&io_request_lock); - kick_pending_request_queues(); - spin_unlock_irq(&io_request_lock); -} - -static void blkif_status_change(blkif_fe_interface_status_changed_t *status) -{ if ( status->handle != 0 ) { printk(KERN_WARNING "Status change on unsupported blkif %d\n", @@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe_interface_status_changed_t *status) " in state %d\n", blkif_state); break; } - blkif_statechange_tq.routine = blkif_bringup_phase1; - schedule_task(&blkif_statechange_tq); + + /* Move from CLOSED to DISCONNECTED state. */ + blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL); + blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0; + blkif_state = BLKIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_BLKIF_FE; + cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(blkif_fe_interface_connect_t); + up.handle = 0; + up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. 
*/ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); break; case BLKIF_INTERFACE_STATUS_CONNECTED: @@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe_interface_status_changed_t *status) " in state %d\n", blkif_state); break; } + blkif_evtchn = status->evtchn; - blkif_statechange_tq.routine = blkif_bringup_phase2; - schedule_task(&blkif_statechange_tq); + blkif_irq = bind_evtchn_to_irq(blkif_evtchn); + (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL); + + /* Probe for discs that are attached to the interface. */ + xlvbd_init(); + + blkif_state = BLKIF_STATE_CONNECTED; + + /* Kick pending requests. */ + spin_lock_irq(&io_request_lock); + kick_pending_request_queues(); + spin_unlock_irq(&io_request_lock); break; default: @@ -675,7 +671,11 @@ int __init xlblk_init(void) ctrl_msg_t cmsg; blkif_fe_driver_status_changed_t st; - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx); + if ( start_info.flags & SIF_INITDOMAIN ) + return 0; + + (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); /* Send a driver-UP notification to the domain controller. */ cmsg.type = CMSG_BLKIF_FE; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c index b26907192a..12ce976cb5 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c @@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *disk_info) memset(&req, 0, sizeof(req)); req.operation = BLKIF_OP_PROBE; req.nr_segments = 1; - req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512); + req.frame_and_sects[0] = virt_to_machine(buf) | 7; blkif_control_send(&req, &rsp); diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c index e01896385b..244f309467 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c @@ -513,7 +513,7 @@ static int __init xencons_init(void) } else { - (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx); + (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); } printk("Xen virtual console successfully installed\n"); diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c index e0e43ff2cc..cf1b075031 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c @@ -10,8 +10,6 @@ static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) { - DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype); - switch ( msg->subtype ) { case CMSG_NETIF_BE_CREATE: @@ -54,7 +52,8 @@ void netif_ctrlif_init(void) ctrl_msg_t cmsg; netif_be_driver_status_changed_t st; - (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx); + (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); /* Send a driver-UP notification to the domain controller. 
*/ cmsg.type = CMSG_NETIF_BE; diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c index 8623d8214b..b6a9cff692 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c @@ -7,6 +7,7 @@ */ #include "common.h" +#include <linux/rtnetlink.h> #define NETIF_HASHSZ 1024 #define NETIF_HASH(_d,_h) \ @@ -14,6 +15,7 @@ static netif_t *netif_hash[NETIF_HASHSZ]; static struct net_device *bridge_dev; +static struct net_bridge *bridge_br; netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) { @@ -36,8 +38,10 @@ void __netif_disconnect_complete(netif_t *netif) */ unbind_evtchn_from_irq(netif->evtchn); vfree(netif->tx); /* Frees netif->rx as well. */ - (void)br_del_if((struct net_bridge *)bridge_dev->priv, netif->dev); + rtnl_lock(); + (void)br_del_if(bridge_br, netif->dev); (void)dev_close(netif->dev); + rtnl_unlock(); /* Construct the deferred response message. */ cmsg.type = CMSG_NETIF_BE; @@ -73,7 +77,7 @@ void netif_create(netif_be_create_t *create) struct net_device *dev; netif_t **pnetif, *netif; - dev = alloc_netdev(sizeof(netif_t), "netif-be-%d", ether_setup); + dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup); if ( dev == NULL ) { DPRINTK("Could not create netif: out of memory\n"); @@ -111,7 +115,10 @@ void netif_create(netif_be_create_t *create) dev->hard_start_xmit = netif_be_start_xmit; dev->get_stats = netif_be_get_stats; memcpy(dev->dev_addr, create->mac, ETH_ALEN); - + + /* XXX In bridge mode we should force a different MAC from remote end. */ + dev->dev_addr[2] ^= 1; + if ( register_netdev(dev) != 0 ) { DPRINTK("Could not register new net device\n"); @@ -225,15 +232,27 @@ void netif_connect(netif_be_connect_t *connect) netif->status = CONNECTED; netif_get(netif); + rtnl_lock(); + (void)dev_open(netif->dev); - (void)br_add_if((struct net_bridge *)bridge_dev->priv, netif->dev); - /* At this point we try to ensure that eth0 is attached to the bridge. */ + (void)br_add_if(bridge_br, netif->dev); + + /* + * The default config is a very simple binding to eth0. + * If eth0 is being used as an IP interface by this OS then someone + * must add eth0's IP address to nbe-br, and change the routing table + * to refer to nbe-br instead of eth0. 
+ */ + (void)dev_open(bridge_dev); if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL ) { (void)dev_open(eth0_dev); - (void)br_add_if((struct net_bridge *)bridge_dev->priv, eth0_dev); + (void)br_add_if(bridge_br, eth0_dev); } - (void)request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif); + + rtnl_unlock(); + + (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif); netif_start_queue(netif->dev); connect->status = NETIF_BE_STATUS_OKAY; @@ -271,8 +290,11 @@ int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id) void netif_interface_init(void) { memset(netif_hash, 0, sizeof(netif_hash)); - if ( br_add_bridge("netif-backend") != 0 ) + if ( br_add_bridge("nbe-br") != 0 ) BUG(); - bridge_dev = __dev_get_by_name("netif-be-bridge"); - (void)dev_open(bridge_dev); + bridge_dev = __dev_get_by_name("nbe-br"); + bridge_br = (struct net_bridge *)bridge_dev->priv; + bridge_br->bridge_hello_time = bridge_br->hello_time = 0; + bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0; + bridge_br->stp_enabled = 0; } diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c index 5b84eba9bc..62a4adf27d 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c @@ -14,7 +14,7 @@ #include <asm/hypervisor-ifs/dom_mem_ops.h> static void net_tx_action(unsigned long unused); -static void tx_skb_release(struct sk_buff *skb); +static void netif_page_release(struct page *page); static void make_tx_response(netif_t *netif, u16 id, s8 st); @@ -30,13 +30,13 @@ static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); #define tx_work_exists(_if) (1) #define MAX_PENDING_REQS 256 -unsigned long mmap_vstart; +static unsigned long mmap_vstart; #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) #define PKT_PROT_LEN (ETH_HLEN + 20) -/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/ static u16 pending_id[MAX_PENDING_REQS]; +static netif_t *pending_netif[MAX_PENDING_REQS]; static u16 pending_ring[MAX_PENDING_REQS]; static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; typedef unsigned int PEND_RING_IDX; @@ -60,8 +60,7 @@ static void __refresh_mfn_list(void) op.u.increase.pages = mfn_list; if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC ) { - printk(KERN_WARNING "Unable to increase memory reservation (%d)\n", - ret); + printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret); BUG(); } alloc_index = MAX_MFN_ALLOC; @@ -100,10 +99,10 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) { netif_t *netif = (netif_t *)dev->priv; s8 status = NETIF_RSP_OKAY; - u16 size, id; + u16 size=0, id; mmu_update_t mmu[6]; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - unsigned long vdata, new_mfn; + unsigned long vdata, mdata=0, new_mfn; /* Drop the packet if the target domain has no receive buffers. 
*/ if ( (netif->rx_req_cons == netif->rx->req_prod) || @@ -126,16 +125,23 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) (((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) || ((skb->end - skb->head) < (PAGE_SIZE/2)) ) { - struct sk_buff *nskb = dev_alloc_skb(PAGE_SIZE-1024); + struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC); int hlen = skb->data - skb->head; + if ( unlikely(nskb == NULL) ) + { + DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid); + status = NETIF_RSP_ERROR; + goto out; + } skb_reserve(nskb, hlen); - skb_put(nskb, skb->len); + __skb_put(nskb, skb->len); (void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len); dev_kfree_skb(skb); skb = nskb; } vdata = (unsigned long)skb->data; + mdata = virt_to_machine(vdata); size = skb->tail - skb->data; new_mfn = get_new_mfn(); @@ -153,7 +159,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) mmu[1].ptr |= MMU_EXTENDED_COMMAND; mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H; - mmu[2].ptr = virt_to_machine(vdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; + mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND; mmu[2].val = MMUEXT_REASSIGN_PAGE; mmu[3].ptr = MMU_EXTENDED_COMMAND; @@ -167,6 +173,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) ) { + DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid); dealloc_mfn(new_mfn); status = NETIF_RSP_ERROR; goto out; @@ -174,12 +181,12 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn; - netif->stats.tx_bytes += size; - netif->stats.tx_packets++; + netif->stats.rx_bytes += size; + netif->stats.rx_packets++; out: spin_lock(&netif->rx_lock); - make_rx_response(netif, id, status, virt_to_machine(vdata), size); + make_rx_response(netif, id, status, mdata, size); spin_unlock(&netif->rx_lock); dev_kfree_skb(skb); return 0; @@ -220,6 +227,16 @@ static void add_to_net_schedule_list_tail(netif_t *netif) spin_unlock(&net_schedule_list_lock); } +static inline void netif_schedule_work(netif_t *netif) +{ + if ( (netif->tx_req_cons != netif->tx->req_prod) && + ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) + { + add_to_net_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + void netif_deschedule(netif_t *netif) { remove_from_net_schedule_list(netif); @@ -229,14 +246,8 @@ void netif_deschedule(netif_t *netif) static void tx_credit_callback(unsigned long data) { netif_t *netif = (netif_t *)data; - netif->remaining_credit = netif->credit_bytes; - - if ( tx_work_exists(netif) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } + netif_schedule_work(netif); } #endif @@ -249,6 +260,7 @@ static void net_tx_action(unsigned long unused) u16 pending_idx; NETIF_RING_IDX i; pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED); + struct page *page; while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && !list_empty(&net_schedule_list) ) @@ -261,7 +273,7 @@ static void net_tx_action(unsigned long unused) /* Work to do? 
*/ i = netif->tx_req_cons; - if ( (i == netif->tx->req_prod) && + if ( (i == netif->tx->req_prod) || ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) { netif_put(netif); @@ -296,7 +308,7 @@ static void net_tx_action(unsigned long unused) netif->remaining_credit -= tx.size; #endif - add_to_net_schedule_list_tail(netif); + netif_schedule_work(netif); if ( unlikely(txreq.size <= PKT_PROT_LEN) || unlikely(txreq.size > ETH_FRAME_LEN) ) @@ -335,6 +347,7 @@ static void net_tx_action(unsigned long unused) if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) ) { + DPRINTK("Can't allocate a skb in start_xmit.\n"); make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); netif_put(netif); vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE); @@ -346,29 +359,29 @@ static void net_tx_action(unsigned long unused) (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), PKT_PROT_LEN); - skb->dev = netif->dev; - skb->protocol = eth_type_trans(skb, skb->dev); - + page = virt_to_page(MMAP_VADDR(pending_idx)); + /* Append the packet payload as a fragment. */ - skb_shinfo(skb)->frags[0].page = - virt_to_page(MMAP_VADDR(pending_idx)); - skb_shinfo(skb)->frags[0].size = - txreq.size - PKT_PROT_LEN; + skb_shinfo(skb)->frags[0].page = page; + skb_shinfo(skb)->frags[0].size = txreq.size - PKT_PROT_LEN; skb_shinfo(skb)->frags[0].page_offset = (txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK; skb_shinfo(skb)->nr_frags = 1; skb->data_len = txreq.size - PKT_PROT_LEN; skb->len += skb->data_len; + skb->dev = netif->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + /* Destructor information. */ - skb->destructor = tx_skb_release; - skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif; - skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx; + atomic_set(&page->count, 1); + page->mapping = (struct address_space *)netif_page_release; + pending_id[pending_idx] = txreq.id; + pending_netif[pending_idx] = netif; - netif->stats.rx_bytes += txreq.size; - netif->stats.rx_packets++; + netif->stats.tx_bytes += txreq.size; + netif->stats.tx_packets++; - pending_id[pending_idx] = txreq.id; pending_cons++; netif_rx(skb); @@ -376,28 +389,34 @@ static void net_tx_action(unsigned long unused) } } -/* Destructor function for tx skbs. */ -static void tx_skb_release(struct sk_buff *skb) +static void netif_page_release(struct page *page) { unsigned long flags; - netif_t *netif = (netif_t *)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page; - u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size; + netif_t *netif; + u16 pending_idx; + + pending_idx = page - virt_to_page(mmap_vstart); + + netif = pending_netif[pending_idx]; vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE); - - skb_shinfo(skb)->nr_frags = 0; - + spin_lock(&netif->tx_lock); make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY); spin_unlock(&netif->tx_lock); - + + /* + * Scheduling checks must happen after the above response is posted. + * This avoids a possible race with a guest OS on another CPU. 
+ */ + mb(); + netif_schedule_work(netif); + netif_put(netif); spin_lock_irqsave(&pend_prod_lock, flags); pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; spin_unlock_irqrestore(&pend_prod_lock, flags); - - maybe_schedule_tx_action(); } #if 0 @@ -493,9 +512,26 @@ static void make_rx_response(netif_t *netif, static int __init init_module(void) { + int i; + + if ( !(start_info.flags & SIF_INITDOMAIN) ) + return 0; + netif_interface_init(); - mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS); + + if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 ) + BUG(); + + pending_cons = 0; + pending_prod = MAX_PENDING_REQS; + for ( i = 0; i < MAX_PENDING_REQS; i++ ) + pending_ring[i] = i; + + spin_lock_init(&net_schedule_list_lock); + INIT_LIST_HEAD(&net_schedule_list); + netif_ctrlif_init(); + return 0; } diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c index af8e660b7c..cc5ac31e82 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c +++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c @@ -25,20 +25,18 @@ #include <net/sock.h> #include <net/pkt_sched.h> -#include "../netif.h" +#include <asm/evtchn.h> +#include <asm/ctrl_if.h> +#include <asm/hypervisor-ifs/dom_mem_ops.h> -static struct tq_struct netif_statechange_tq; +#include "../netif.h" #define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ -static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs); static void network_tx_buf_gc(struct net_device *dev); static void network_alloc_rx_buffers(struct net_device *dev); static void cleanup_module(void); -/* Dynamically-mapped IRQs. */ -static int network_irq, debug_irq; - static struct list_head dev_list; struct net_private @@ -47,7 +45,7 @@ struct net_private struct net_device *dev; struct net_device_stats stats; - NET_RING_IDX rx_resp_cons, tx_resp_cons; + NETIF_RING_IDX rx_resp_cons, tx_resp_cons; unsigned int tx_full; netif_tx_interface_t *tx; @@ -69,8 +67,8 @@ struct net_private * {tx,rx}_skbs store outstanding skbuffs. The first entry in each * array is an index into a chain of free entries. */ - struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1]; - struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1]; + struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; }; /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */ @@ -91,7 +89,7 @@ static struct net_device *find_dev_by_handle(unsigned int handle) { np = list_entry(ent, struct net_private, list); if ( np->handle == handle ) - return np; + return np->dev; } return NULL; } @@ -100,8 +98,7 @@ static struct net_device *find_dev_by_handle(unsigned int handle) static int network_open(struct net_device *dev) { struct net_private *np = dev->priv; - netop_t netop; - int i, ret; + int i; if ( np->state != NETIF_STATE_CONNECTED ) return -EINVAL; @@ -111,15 +108,16 @@ static int network_open(struct net_device *dev) spin_lock_init(&np->tx_lock); /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
*/ - for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ ) + for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ ) np->tx_skbs[i] = (void *)(i+1); - for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ ) + for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ ) np->rx_skbs[i] = (void *)(i+1); wmb(); np->state = NETIF_STATE_ACTIVE; network_alloc_rx_buffers(dev); + np->rx->event = np->rx_resp_cons + 1; netif_start_queue(dev); @@ -131,18 +129,17 @@ static int network_open(struct net_device *dev) static void network_tx_buf_gc(struct net_device *dev) { - NET_RING_IDX i, prod; + NETIF_RING_IDX i, prod; unsigned short id; struct net_private *np = dev->priv; struct sk_buff *skb; - tx_entry_t *tx_ring = np->net_ring->tx_ring; do { - prod = np->net_idx->tx_resp_prod; + prod = np->tx->resp_prod; for ( i = np->tx_resp_cons; i != prod; i++ ) { - id = tx_ring[MASK_NET_TX_IDX(i)].resp.id; + id = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id; skb = np->tx_skbs[id]; ADD_ID_TO_FREELIST(np->tx_skbs, id); dev_kfree_skb_any(skb); @@ -158,14 +155,14 @@ static void network_tx_buf_gc(struct net_device *dev) * in such cases notification from Xen is likely to be the only kick * that we'll get. */ - np->net_idx->tx_event = - prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1; + np->tx->event = + prod + ((np->tx->req_prod - prod) >> 1) + 1; mb(); } - while ( prod != np->net_idx->tx_resp_prod ); + while ( prod != np->tx->resp_prod ); if ( np->tx_full && - ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) ) + ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) ) { np->tx_full = 0; if ( np->state == NETIF_STATE_ACTIVE ) @@ -189,10 +186,14 @@ static void network_alloc_rx_buffers(struct net_device *dev) unsigned short id; struct net_private *np = dev->priv; struct sk_buff *skb; - netop_t netop; - NET_RING_IDX i = np->net_idx->rx_req_prod; - - if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) || + NETIF_RING_IDX i = np->rx->req_prod; + dom_mem_op_t op; + unsigned long pfn_array[NETIF_RX_RING_SIZE]; + int ret, nr_pfns = 0; + pte_t *pte; + + /* Make sure the batch is large enough to be worthwhile (1/2 ring). */ + if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) || unlikely(np->state != NETIF_STATE_ACTIVE) ) return; @@ -209,13 +210,13 @@ static void network_alloc_rx_buffers(struct net_device *dev) id = GET_ID_FROM_FREELIST(np->rx_skbs); np->rx_skbs[id] = skb; - np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id; - np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = - virt_to_machine(get_ppte(skb->head)); - - np->rx_bufs_to_notify++; + np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id; + + pte = get_ppte(skb->head); + pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT; + queue_l1_entry_update(pte, 0); } - while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE ); + while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE ); /* * We may have allocated buffers which have entries outstanding in the page @@ -223,17 +224,16 @@ static void network_alloc_rx_buffers(struct net_device *dev) */ flush_page_update_queue(); - np->net_idx->rx_req_prod = i; - np->net_idx->rx_event = np->rx_resp_cons + 1; - - /* Batch Xen notifications. 
*/ - if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) ) + op.op = MEMOP_RESERVATION_DECREASE; + op.u.decrease.size = nr_pfns; + op.u.decrease.pages = pfn_array; + if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns ) { - netop.cmd = NETOP_PUSH_BUFFERS; - netop.vif = np->idx; - (void)HYPERVISOR_net_io_op(&netop); - np->rx_bufs_to_notify = 0; + printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret); + BUG(); } + + np->rx->req_prod = i; } @@ -241,9 +241,8 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned short id; struct net_private *np = (struct net_private *)dev->priv; - tx_req_entry_t *tx; - netop_t netop; - NET_RING_IDX i; + netif_tx_request_t *tx; + NETIF_RING_IDX i; if ( unlikely(np->tx_full) ) { @@ -262,27 +261,27 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) memcpy(new_skb->data, skb->data, skb->len); dev_kfree_skb(skb); skb = new_skb; - } + } spin_lock_irq(&np->tx_lock); - i = np->net_idx->tx_req_prod; + i = np->tx->req_prod; id = GET_ID_FROM_FREELIST(np->tx_skbs); np->tx_skbs[id] = skb; - tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req; + tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req; tx->id = id; - tx->addr = phys_to_machine(virt_to_phys(skb->data)); + tx->addr = virt_to_machine(skb->data); tx->size = skb->len; wmb(); - np->net_idx->tx_req_prod = i + 1; + np->tx->req_prod = i + 1; network_tx_buf_gc(dev); - if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) ) + if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) ) { np->tx_full = 1; netif_stop_queue(dev); @@ -295,12 +294,8 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) /* Only notify Xen if there are no outstanding responses. */ mb(); - if ( np->net_idx->tx_resp_prod == i ) - { - netop.cmd = NETOP_PUSH_BUFFERS; - netop.vif = np->idx; - (void)HYPERVISOR_net_io_op(&netop); - } + if ( np->tx->resp_prod == i ) + notify_via_evtchn(np->evtchn); return 0; } @@ -312,22 +307,24 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) struct net_private *np = dev->priv; unsigned long flags; struct sk_buff *skb; - rx_resp_entry_t *rx; - NET_RING_IDX i; + netif_rx_response_t *rx; + NETIF_RING_IDX i; + mmu_update_t mmu[2]; + pte_t *pte; spin_lock_irqsave(&np->tx_lock, flags); network_tx_buf_gc(dev); spin_unlock_irqrestore(&np->tx_lock, flags); again: - for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ ) + for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ ) { - rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp; + rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp; skb = np->rx_skbs[rx->id]; ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); - if ( unlikely(rx->status != RING_STATUS_OK) ) + if ( unlikely(rx->status <= 0) ) { /* Gate this error. We get a (valid) slew of them on suspend. */ if ( np->state == NETIF_STATE_ACTIVE ) @@ -336,6 +333,17 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) continue; } + /* Remap the page. 
*/ + pte = get_ppte(skb->head); + mmu[0].ptr = virt_to_machine(pte); + mmu[0].val = (rx->addr & PAGE_MASK) | __PAGE_KERNEL; + mmu[1].ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; + mmu[1].val = __pa(skb->head) >> PAGE_SHIFT; + if ( HYPERVISOR_mmu_update(mmu, 2) != 0 ) + BUG(); + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = + rx->addr >> PAGE_SHIFT; + /* * Set up shinfo -- from alloc_skb This was particularily nasty: the * shared info is hidden at the back of the data area (presumably so it @@ -348,13 +356,13 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; - skb->data = skb->tail = skb->head + rx->offset; - skb_put(skb, rx->size); + skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK); + skb_put(skb, rx->status); skb->protocol = eth_type_trans(skb, dev); np->stats.rx_packets++; - np->stats.rx_bytes += rx->size; + np->stats.rx_bytes += rx->status; netif_rx(skb); dev->last_rx = jiffies; } @@ -362,10 +370,11 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) np->rx_resp_cons = i; network_alloc_rx_buffers(dev); + np->rx->event = np->rx_resp_cons + 1; /* Deal with hypervisor racing our resetting of rx_event. */ mb(); - if ( np->net_idx->rx_resp_prod != i ) + if ( np->rx->resp_prod != i ) goto again; } @@ -373,16 +382,11 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs) static int network_close(struct net_device *dev) { struct net_private *np = dev->priv; - netop_t netop; netif_stop_queue(np->dev); - netop.cmd = NETOP_FLUSH_BUFFERS; - netop.vif = np->idx; - (void)HYPERVISOR_net_io_op(&netop); - - while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) || - (np->tx_resp_cons != np->net_idx->tx_req_prod) ) + while ( (np->rx_resp_cons != np->rx->req_prod) || + (np->tx_resp_cons != np->tx->req_prod) ) { barrier(); current->state = TASK_INTERRUPTIBLE; @@ -406,55 +410,12 @@ static struct net_device_stats *network_get_stats(struct net_device *dev) } -static void netif_bringup_phase1(void *unused) +static void netif_status_change(netif_fe_interface_status_changed_t *status) { ctrl_msg_t cmsg; netif_fe_interface_connect_t up; struct net_device *dev; struct net_private *np; - - dev = find_dev_by_handle(0); - np = dev->priv; - - /* Move from CLOSED to DISCONNECTED state. */ - np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); - np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); - memset(np->tx, 0, PAGE_SIZE); - memset(np->rx, 0, PAGE_SIZE); - np->state = NETIF_STATE_DISCONNECTED; - - /* Construct an interface-CONNECT message for the domain controller. */ - cmsg.type = CMSG_NETIF_FE; - cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; - cmsg.length = sizeof(netif_fe_interface_connect_t); - up.handle = 0; - up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; - up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; - memcpy(cmsg.msg, &up, sizeof(up)); - - /* Tell the controller to bring up the interface. 
*/ - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -static void netif_bringup_phase2(void *unused) -{ - struct net_device *dev; - struct net_private *np; - - dev = find_dev_by_handle(0); - np = dev->priv; - - np->irq = bind_evtchn_to_irq(np->evtchn); - (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, - "netif", dev); - - np->state = NETIF_STATE_CONNECTED; -} - -static void netif_status_change(netif_fe_interface_status_changed_t *status) -{ - struct net_device *dev; - struct net_private *np; if ( status->handle != 0 ) { @@ -470,31 +431,53 @@ static void netif_status_change(netif_fe_interface_status_changed_t *status) { case NETIF_INTERFACE_STATUS_DESTROYED: printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n", - netif_state); + np->state); break; case NETIF_INTERFACE_STATUS_DISCONNECTED: if ( np->state != NETIF_STATE_CLOSED ) { printk(KERN_WARNING "Unexpected netif-DISCONNECTED message" - " in state %d\n", netif_state); + " in state %d\n", np->state); break; } - netif_statechange_tq.routine = netif_bringup_phase1; - schedule_task(&netif_statechange_tq); + + /* Move from CLOSED to DISCONNECTED state. */ + np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); + np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); + memset(np->tx, 0, PAGE_SIZE); + memset(np->rx, 0, PAGE_SIZE); + np->state = NETIF_STATE_DISCONNECTED; + + /* Construct an interface-CONNECT message for the domain controller. */ + cmsg.type = CMSG_NETIF_FE; + cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT; + cmsg.length = sizeof(netif_fe_interface_connect_t); + up.handle = 0; + up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT; + up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT; + memcpy(cmsg.msg, &up, sizeof(up)); + + /* Tell the controller to bring up the interface. */ + ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); break; case NETIF_INTERFACE_STATUS_CONNECTED: if ( np->state == NETIF_STATE_CLOSED ) { printk(KERN_WARNING "Unexpected netif-CONNECTED message" - " in state %d\n", netif_state); + " in state %d\n", np->state); break; } - np->evtchn = status->evtchn; + memcpy(dev->dev_addr, status->mac, ETH_ALEN); - netif_statechange_tq.routine = netif_bringup_phase2; - schedule_task(&netif_statechange_tq); + + np->evtchn = status->evtchn; + np->irq = bind_evtchn_to_irq(np->evtchn); + (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, + dev->name, dev); + + np->state = NETIF_STATE_CONNECTED; break; default: @@ -532,10 +515,13 @@ static int __init init_module(void) { ctrl_msg_t cmsg; netif_fe_driver_status_changed_t st; - int i, err; + int err; struct net_device *dev; struct net_private *np; + if ( start_info.flags & SIF_INITDOMAIN ) + return 0; + INIT_LIST_HEAD(&dev_list); if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL ) @@ -562,7 +548,8 @@ static int __init init_module(void) np->dev = dev; list_add(&np->list, &dev_list); - (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx); + (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, + CALLBACK_IN_BLOCKING_CONTEXT); /* Send a driver-UP notification to the domain controller. 
*/ cmsg.type = CMSG_NETIF_FE; diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c index 715f707eb0..19cb9a3326 100644 --- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c +++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c @@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_action; static CONTROL_RING_IDX ctrl_if_tx_resp_cons; static CONTROL_RING_IDX ctrl_if_rx_req_cons; -/* Incoming message requests: primary message type -> message handler. */ +/* Incoming message requests. */ + /* Primary message type -> message handler. */ static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256]; + /* Primary message type -> callback in process context? */ +static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)]; + /* Is it late enough during bootstrap to use schedule_task()? */ +static int safe_to_schedule_task; + /* Passed to schedule_task(). */ +static struct tq_struct ctrl_if_rxmsg_deferred_tq; + /* Queue up messages to be handled in process context. */ +static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE]; +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod; +static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons; /* Incoming message responses: message identifier -> message handler/id. */ static struct { @@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigned long data) } } +static void __ctrl_if_rxmsg_deferred(void *unused) +{ + ctrl_msg_t *msg; + + while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod ) + { + msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( + ctrl_if_rxmsg_deferred_cons++)]; + (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); + } +} + static void __ctrl_if_rx_tasklet(unsigned long data) { control_if_t *ctrl_if = get_ctrl_if(); - ctrl_msg_t *msg; + ctrl_msg_t msg, *pmsg; while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod ) { - /* - * We need no locking or barriers here. There will be one and only one - * response as a result of each callback, so the callback handler - * doesn't need to worry about the 'msg' being overwritten until: - * 1. It returns (if the message must persist then it must be copied). - * 2. A response is sent (the response may overwrite the request). 
- */ - msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; - (*ctrl_if_rxmsg_handler[msg->type])(msg, 0); + pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)]; + memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg)); + if ( msg.length != 0 ) + memcpy(msg.msg, pmsg->msg, msg.length); + if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) ) + { + pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX( + ctrl_if_rxmsg_deferred_prod++)]; + memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length); + schedule_task(&ctrl_if_rxmsg_deferred_tq); + } + else + { + (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0); + } } } @@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *msg) ctrl_if_notify_controller(); } -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd) +int ctrl_if_register_receiver( + u8 type, + ctrl_msg_handler_t hnd, + unsigned int flags) { - unsigned long flags; + unsigned long _flags; int inuse; - spin_lock_irqsave(&ctrl_if_lock, flags); + spin_lock_irqsave(&ctrl_if_lock, _flags); inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler); if ( inuse ) + { printk(KERN_INFO "Receiver %p already established for control " "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type); + } else + { ctrl_if_rxmsg_handler[type] = hnd; + clear_bit(type, &ctrl_if_rxmsg_blocking_context); + if ( flags == CALLBACK_IN_BLOCKING_CONTEXT ) + { + set_bit(type, &ctrl_if_rxmsg_blocking_context); + if ( !safe_to_schedule_task ) + BUG(); + } + } - spin_unlock_irqrestore(&ctrl_if_lock, flags); + spin_unlock_irqrestore(&ctrl_if_lock, _flags); return !inuse; } @@ -326,6 +369,7 @@ void __init ctrl_if_init(void) for ( i = 0; i < 256; i++ ) ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler; + ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred; spin_lock_init(&ctrl_if_lock); @@ -333,6 +377,15 @@ } +/* This is called after it is safe to call schedule_task(). */ +static int __init ctrl_if_late_setup(void) +{ + safe_to_schedule_task = 1; + return 0; +} +__initcall(ctrl_if_late_setup); + + /* * !! The following are DANGEROUS FUNCTIONS !! * Use with care [for example, see xencons_force_flush()]. diff --git a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c index 20a934addd..d219c28403 100644 --- a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c +++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c @@ -1626,7 +1626,7 @@ int __init blk_dev_init(void) jsfd_init(); #endif -#ifdef CONFIG_XEN_VBD +#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO) xlblk_init(); #endif diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h index a02e2471ea..5bc6cc22b1 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h @@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *msg); * Register a receiver for typed messages from the domain controller. The * handler (@hnd) is called for every received message of specified @type. * Returns TRUE (non-zero) if the handler was successfully registered. + * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will + * occur in a context in which it is safe to yield (i.e., process context). 
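Because the tasklet above copies each request out of the shared ring (and, for blocking types, into ctrl_if_rxmsg_deferred) before dispatch, a handler can run long after the ring slot has been reused. A small userland model of this copy-and-defer scheme; the names and the two-function split are illustrative, not the real control-interface API:

#include <stdio.h>

#define RING_SIZE 8
#define MASK(i)   ((i) & (RING_SIZE - 1))

struct msg { int type; char body[16]; };

static struct msg   deferred[RING_SIZE];
static unsigned int def_prod, def_cons;
static int          blocking_type[256];  /* set if type needs process context */

static void handle(struct msg *m) { printf("type %d: %s\n", m->type, m->body); }

/* "Tasklet" side: handle immediately, or copy and defer. The copy is
 * what frees the shared ring slot for reuse. */
static void rx_dispatch(struct msg *m)
{
    if (blocking_type[m->type])
        deferred[MASK(def_prod++)] = *m;
    else
        handle(m);
}

/* "Process context" side, the analogue of __ctrl_if_rxmsg_deferred(). */
static void drain_deferred(void)
{
    while (def_cons != def_prod)
        handle(&deferred[MASK(def_cons++)]);
}

int main(void)
{
    struct msg fast = { 1, "inline" }, slow = { 2, "deferred" };
    blocking_type[2] = 1;
    rx_dispatch(&fast);   /* handled from the "tasklet"          */
    rx_dispatch(&slow);   /* queued for process context          */
    drain_deferred();     /* handled here, ring slot long reused */
    return 0;
}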
*/ -int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd); +#define CALLBACK_IN_BLOCKING_CONTEXT 1 +int ctrl_if_register_receiver( + u8 type, + ctrl_msg_handler_t hnd, + unsigned int flags); /* * Unregister a receiver for typed messages from the domain controller. The diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/io.h b/xenolinux-2.4.26-sparse/include/asm-xen/io.h index f5243bb6a7..5ab5fe9bfc 100644 --- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h +++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h @@ -159,46 +159,11 @@ extern void iounmap(void *addr); extern void *bt_ioremap(unsigned long offset, unsigned long size); extern void bt_iounmap(void *addr, unsigned long size); -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - -#ifdef CONFIG_HIGHMEM -#error "Highmem is not yet compatible with physical device access" -#endif - -/* - * The bus translation macros need special care if we are executing device - * accesses to/from other domains' memory. In these cases the virtual address - * is actually a temporary mapping in the 'vmalloc' space. The physical - * address will therefore be >max_low_pfn, and will not have a valid entry - * in the phys_to_mach mapping table. - */ -static inline unsigned long phys_to_bus(unsigned long phys) -{ - extern unsigned long max_pfn; - pgd_t *pgd; pmd_t *pmd; pte_t *pte; - void *addr; - unsigned long bus; - if ( (phys >> PAGE_SHIFT) < max_pfn ) - return phys_to_machine(phys); - addr = phys_to_virt(phys); - pgd = pgd_offset_k( (unsigned long)addr); - pmd = pmd_offset(pgd, (unsigned long)addr); - pte = pte_offset(pmd, (unsigned long)addr); - bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK); - return bus; -} - -#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x)) -#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) -#define page_to_bus(_x) phys_to_bus(page_to_phys(_x)) - -#else - #define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x)) #define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) #define page_to_bus(_x) phys_to_machine(page_to_phys(_x)) - -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ +#define bus_to_phys(_x) machine_to_phys(_x) +#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT)) /* * readX/writeX() are used to access memory mapped devices. 
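Callers choose the context per message type at registration time. A hedged sketch of the two styles, modelled on the netfront hunk above (the block-device identifiers below are only for illustration):

/* Handler may sleep or send blocking messages: request process context. */
(void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
                                CALLBACK_IN_BLOCKING_CONTEXT);

/* Interrupt-safe handler: pass 0 to keep the direct tasklet callback. */
(void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, 0);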
On some diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/pci.h b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h new file mode 100644 index 0000000000..74ae5ba8b1 --- /dev/null +++ b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h @@ -0,0 +1,283 @@ +#ifndef __i386_PCI_H +#define __i386_PCI_H + +#include <linux/config.h> + +#ifdef __KERNEL__ + +/* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ + +#ifdef CONFIG_PCI +extern unsigned int pcibios_assign_all_busses(void); +#else +#define pcibios_assign_all_busses() 0 +#endif +#define pcibios_scan_all_fns() 0 + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +void pcibios_config_init(void); +struct pci_bus * pcibios_scan_root(int bus); +extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); +extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); + +void pcibios_set_master(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + +/* Dynamic DMA mapping stuff. * i386 has everything mapped statically. */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <asm/scatterlist.h> +#include <linux/string.h> +#include <asm/io.h> + +struct pci_dev; + +/* The networking and block device layers use this boolean for bounce * buffer decisions. */ +#define PCI_DMA_BUS_IS_PHYS (0) + +/* Allocate and map kernel buffer using consistent mode DMA for a device. * hwdev should be valid struct pci_dev pointer for PCI devices, * NULL for PCI-like buses (ISA, EISA). * Returns non-NULL cpu-view pointer to the buffer if successful and * sets *dma_addrp to the pci side dma address as well, else *dma_addrp * is undefined. */ +extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, + dma_addr_t *dma_handle); + +/* Free and unmap a consistent DMA buffer. * cpu_addr is what was returned from pci_alloc_consistent, * size must be the same as what was passed into pci_alloc_consistent, * and likewise dma_addr must be the same as what *dma_addrp was set to. * * References to the memory and mappings associated with cpu_addr/dma_addr * past this call are illegal. */ +extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle); + +/* Map a single buffer of the indicated size for DMA in streaming mode. * The 32-bit bus address to use is returned. * * Once the device is given the dma address, the device owns this memory * until either pci_unmap_single or pci_dma_sync_single is performed. */ +static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, + size_t size, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + flush_write_buffers(); + return virt_to_bus(ptr); +} + +/* Unmap a single streaming mode DMA translation. The dma_addr and size * must match what was provided for in a previous pci_map_single call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. 
+ */ +static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, + size_t size, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + /* Nothing to do */ +} + +/* + * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t. Identical + * to pci_map_single, but takes a struct page instead of a virtual address + */ +static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + + return page_to_bus(page) + offset; +} + +static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address, + size_t size, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + /* Nothing to do */ +} + +/* pci_unmap_{page,single} is a nop so... */ +#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) +#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) +#define pci_unmap_addr(PTR, ADDR_NAME) (0) +#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) +#define pci_unmap_len(PTR, LEN_NAME) (0) +#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) + +/* Map a set of buffers described by scatterlist in streaming * mode for DMA. This is the scatter-gather version of the * above pci_map_single interface. Here the scatter gather list * elements are each tagged with the appropriate dma address * and length. They are obtained via sg_dma_{address,length}(SG). * * NOTE: An implementation may be able to use a smaller number of * DMA address/length pairs than there are SG table elements. * (for example via virtual mapping capabilities) * The routine returns the number of addr/length pairs actually * used, at most nents. * * Device ownership issues as mentioned above for pci_map_single are * the same here. */ +static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, + int nents, int direction) +{ + int i; + + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + + /* + * temporary 2.4 hack + */ + for (i = 0; i < nents; i++ ) { + if (sg[i].address && sg[i].page) + out_of_line_bug(); + else if (!sg[i].address && !sg[i].page) + out_of_line_bug(); + + if (sg[i].address) + sg[i].dma_address = virt_to_bus(sg[i].address); + else + sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset; + } + + flush_write_buffers(); + return nents; +} + +/* Unmap a set of streaming mode DMA translations. * Again, cpu read rules concerning calls here are the same as for * pci_unmap_single() above. */ +static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, + int nents, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + /* Nothing to do */ +} + +/* Make physical memory consistent for a single * streaming mode DMA translation after a transfer. * * If you perform a pci_map_single() but wish to interrogate the * buffer using the cpu, yet do not wish to tear down the PCI dma * mapping, you must call this function before doing so. At the * next point you give the PCI dma address back to the card, the * device again owns the buffer. */ +static inline void pci_dma_sync_single(struct pci_dev *hwdev, + dma_addr_t dma_handle, + size_t size, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + flush_write_buffers(); +} + +/* Make physical memory consistent for a set of streaming * mode DMA translations after a transfer. * * The same as pci_dma_sync_single but for a scatter-gather list, * same rules and usage. 
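Taken together, the streaming-mode calls above give the usual 2.4 DMA pattern; the sketch below uses placeholder hwdev/buf/len values. The Xen twist is that virt_to_bus() now resolves to phys_to_machine(), so the address handed to the device is a machine address:

dma_addr_t dma = pci_map_single(hwdev, buf, len, PCI_DMA_FROMDEVICE);
/* ... program the device with 'dma' and let it write the buffer ... */
pci_dma_sync_single(hwdev, dma, len, PCI_DMA_FROMDEVICE);  /* CPU may read now */
pci_unmap_single(hwdev, dma, len, PCI_DMA_FROMDEVICE);     /* a no-op on this port */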
+ */ +static inline void pci_dma_sync_sg(struct pci_dev *hwdev, + struct scatterlist *sg, + int nelems, int direction) +{ + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + flush_write_buffers(); +} + +/* Return whether the given PCI device DMA address mask can + * be supported properly. For example, if your device can + * only drive the low 24-bits during PCI bus mastering, then + * you would pass 0x00ffffff as the mask to this function. + */ +static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. + */ + if(mask < 0x00ffffff) + return 0; + + return 1; +} + +/* This is always fine. */ +#define pci_dac_dma_supported(pci_dev, mask) (1) + +static __inline__ dma64_addr_t +pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) +{ + return ((dma64_addr_t) page_to_bus(page) + + (dma64_addr_t) offset); +} + +static __inline__ struct page * +pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) +{ + return bus_to_page(dma_addr); +} + +static __inline__ unsigned long +pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) +{ + return (dma_addr & ~PAGE_MASK); +} + +static __inline__ void +pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) +{ + flush_write_buffers(); +} + +/* These macros should be used after a pci_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries pci_map_sg + * returns. + */ +#define sg_dma_address(sg) ((sg)->dma_address) +#define sg_dma_len(sg) ((sg)->length) + +/* Return the index of the PCI controller for device. */ +static inline int pci_controller_num(struct pci_dev *dev) +{ + return 0; +} + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); + +#endif /* __KERNEL__ */ + +#endif /* __i386_PCI_H */ diff --git a/xenolinux-2.4.26-sparse/mkbuildtree b/xenolinux-2.4.26-sparse/mkbuildtree index 46fe4784ad..2e9f7b9920 100755 --- a/xenolinux-2.4.26-sparse/mkbuildtree +++ b/xenolinux-2.4.26-sparse/mkbuildtree @@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h ln -sf ../asm-i386/namei.h ln -sf ../asm-i386/param.h ln -sf ../asm-i386/parport.h -ln -sf ../asm-i386/pci.h ln -sf ../asm-i386/pgtable-3level.h ln -sf ../asm-i386/poll.h ln -sf ../asm-i386/posix_types.h diff --git a/xenolinux-2.4.26-sparse/mm/page_alloc.c b/xenolinux-2.4.26-sparse/mm/page_alloc.c new file mode 100644 index 0000000000..62ed7751a5 --- /dev/null +++ b/xenolinux-2.4.26-sparse/mm/page_alloc.c @@ -0,0 +1,930 @@ +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. 
+ * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapctl.h> +#include <linux/interrupt.h> +#include <linux/pagemap.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/module.h> + +int nr_swap_pages; +int nr_active_pages; +int nr_inactive_pages; +LIST_HEAD(inactive_list); +LIST_HEAD(active_list); +pg_data_t *pgdat_list; + +/* + * + * The zone_table array is used to look up the address of the + * struct zone corresponding to a given zone number (ZONE_DMA, + * ZONE_NORMAL, or ZONE_HIGHMEM). + */ +zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; +EXPORT_SYMBOL(zone_table); + +static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; +static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; +static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; +static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; + +int vm_gfp_debug = 0; + +/* + * Temporary debugging check. + */ +#define BAD_RANGE(zone, page) \ +( \ + (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ + || (((page) - mem_map) < (zone)->zone_start_mapnr) \ + || ((zone) != page_zone(page)) \ +) + +/* + * Freeing function for a buddy system allocator. + * Contrary to prior comments, this is *NOT* hairy, and there + * is no reason for anyone not to understand it. + * + * The concept of a buddy system is to maintain direct-mapped tables + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies". + * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep one bit for each pair of blocks, which + * is set to 1 iff only one of the pair is allocated. So when we + * are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- wli + */ + +static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); +static void __free_pages_ok (struct page *page, unsigned int order) +{ + unsigned long index, page_idx, mask, flags; + free_area_t *area; + struct page *base; + zone_t *zone; + + /* + * Yes, think what happens when other parts of the kernel take + * a reference to a page in order to pin it for io. 
-ben + */ + if (PageLRU(page)) { + if (unlikely(in_interrupt())) + BUG(); + lru_cache_del(page); + } + + if (page->buffers) + BUG(); + if (page->mapping) + return (*(void(*)(struct page *))page->mapping)(page); + if (!VALID_PAGE(page)) + BUG(); + if (PageLocked(page)) + BUG(); + if (PageActive(page)) + BUG(); + ClearPageReferenced(page); + ClearPageDirty(page); + + if (current->flags & PF_FREE_PAGES) + goto local_freelist; + back_local_freelist: + + zone = page_zone(page); + + mask = (~0UL) << order; + base = zone->zone_mem_map; + page_idx = page - base; + if (page_idx & ~mask) + BUG(); + index = page_idx >> (1 + order); + + area = zone->free_area + order; + + spin_lock_irqsave(&zone->lock, flags); + + zone->free_pages -= mask; + + while (mask + (1 << (MAX_ORDER-1))) { + struct page *buddy1, *buddy2; + + if (area >= zone->free_area + MAX_ORDER) + BUG(); + if (!__test_and_change_bit(index, area->map)) + /* + * the buddy page is still allocated. + */ + break; + /* + * Move the buddy up one level. + * This code is taking advantage of the identity: + * -mask = 1+~mask + */ + buddy1 = base + (page_idx ^ -mask); + buddy2 = base + page_idx; + if (BAD_RANGE(zone,buddy1)) + BUG(); + if (BAD_RANGE(zone,buddy2)) + BUG(); + + list_del(&buddy1->list); + mask <<= 1; + area++; + index >>= 1; + page_idx &= mask; + } + list_add(&(base + page_idx)->list, &area->free_list); + + spin_unlock_irqrestore(&zone->lock, flags); + return; + + local_freelist: + if (current->nr_local_pages) + goto back_local_freelist; + if (in_interrupt()) + goto back_local_freelist; + + list_add(&page->list, ¤t->local_pages); + page->index = order; + current->nr_local_pages++; +} + +#define MARK_USED(index, order, area) \ + __change_bit((index) >> (1+(order)), (area)->map) + +static inline struct page * expand (zone_t *zone, struct page *page, + unsigned long index, int low, int high, free_area_t * area) +{ + unsigned long size = 1 << high; + + while (high > low) { + if (BAD_RANGE(zone,page)) + BUG(); + area--; + high--; + size >>= 1; + list_add(&(page)->list, &(area)->free_list); + MARK_USED(index, high, area); + index += size; + page += size; + } + if (BAD_RANGE(zone,page)) + BUG(); + return page; +} + +static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); +static struct page * rmqueue(zone_t *zone, unsigned int order) +{ + free_area_t * area = zone->free_area + order; + unsigned int curr_order = order; + struct list_head *head, *curr; + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&zone->lock, flags); + do { + head = &area->free_list; + curr = head->next; + + if (curr != head) { + unsigned int index; + + page = list_entry(curr, struct page, list); + if (BAD_RANGE(zone,page)) + BUG(); + list_del(curr); + index = page - zone->zone_mem_map; + if (curr_order != MAX_ORDER-1) + MARK_USED(index, curr_order, area); + zone->free_pages -= 1UL << order; + + page = expand(zone, page, index, order, curr_order, area); + spin_unlock_irqrestore(&zone->lock, flags); + + set_page_count(page, 1); + if (BAD_RANGE(zone,page)) + BUG(); + if (PageLRU(page)) + BUG(); + if (PageActive(page)) + BUG(); + return page; + } + curr_order++; + area++; + } while (curr_order < MAX_ORDER); + spin_unlock_irqrestore(&zone->lock, flags); + + return NULL; +} + +#ifndef CONFIG_DISCONTIGMEM +struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) +{ + return __alloc_pages(gfp_mask, order, + contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); +} +#endif + +static struct page * FASTCALL(balance_classzone(zone_t *, 
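The buddy arithmetic in __free_pages_ok above leans on the identity -mask == 1 + ~mask == 1UL << order, so page_idx ^ -mask is simply page_idx with bit 'order' flipped, which is where the buddy block lives. A stand-alone check of that arithmetic:

#include <stdio.h>

int main(void)
{
    unsigned int  order = 2;                     /* blocks of 4 pages       */
    unsigned long mask = (~0UL) << order;
    unsigned long page_idx = 8;                  /* block covers pages 8-11 */
    unsigned long buddy_idx = page_idx ^ -mask;  /* == page_idx ^ 4         */

    printf("buddy of block at %lu (order %u) starts at %lu\n",
           page_idx, order, buddy_idx);          /* prints 12 (pages 12-15) */
    return 0;
}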
unsigned int, unsigned int, int *)); +static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) +{ + struct page * page = NULL; + int __freed; + + if (in_interrupt()) + BUG(); + + current->allocation_order = order; + current->flags |= PF_MEMALLOC | PF_FREE_PAGES; + + __freed = try_to_free_pages_zone(classzone, gfp_mask); + + current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); + + if (current->nr_local_pages) { + struct list_head * entry, * local_pages; + struct page * tmp; + int nr_pages; + + local_pages = &current->local_pages; + + if (likely(__freed)) { + /* pick from the last inserted so we're lifo */ + entry = local_pages->next; + do { + tmp = list_entry(entry, struct page, list); + if (tmp->index == order && memclass(page_zone(tmp), classzone)) { + list_del(entry); + current->nr_local_pages--; + set_page_count(tmp, 1); + page = tmp; + + if (page->buffers) + BUG(); + if (page->mapping) + BUG(); + if (!VALID_PAGE(page)) + BUG(); + if (PageLocked(page)) + BUG(); + if (PageLRU(page)) + BUG(); + if (PageActive(page)) + BUG(); + if (PageDirty(page)) + BUG(); + + break; + } + } while ((entry = entry->next) != local_pages); + } + + nr_pages = current->nr_local_pages; + /* free in reverse order so that the global order will be lifo */ + while ((entry = local_pages->prev) != local_pages) { + list_del(entry); + tmp = list_entry(entry, struct page, list); + __free_pages_ok(tmp, tmp->index); + if (!nr_pages--) + BUG(); + } + current->nr_local_pages = 0; + } + + *freed = __freed; + return page; +} + +static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) +{ + long free = zone->free_pages - (1UL << order); + return free >= 0 ? free : 0; +} + +/* + * This is the 'heart' of the zoned buddy allocator: + */ +struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) +{ + zone_t **zone, * classzone; + struct page * page; + int freed, class_idx; + + zone = zonelist->zones; + classzone = *zone; + class_idx = zone_idx(classzone); + + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { + page = rmqueue(z, order); + if (page) + return page; + } + } + + classzone->need_balance = 1; + mb(); + if (waitqueue_active(&kswapd_wait)) + wake_up_interruptible(&kswapd_wait); + + zone = zonelist->zones; + for (;;) { + unsigned long min; + zone_t *z = *(zone++); + if (!z) + break; + + min = z->watermarks[class_idx].min; + if (!(gfp_mask & __GFP_WAIT)) + min >>= 2; + if (zone_free_pages(z, order) > min) { + page = rmqueue(z, order); + if (page) + return page; + } + } + + /* here we're in the low on memory slow path */ + + if ((current->flags & PF_MEMALLOC) && + (!in_interrupt() || (current->flags & PF_MEMDIE))) { + zone = zonelist->zones; + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + page = rmqueue(z, order); + if (page) + return page; + } + return NULL; + } + + /* Atomic allocations - we can't balance anything */ + if (!(gfp_mask & __GFP_WAIT)) + goto out; + + rebalance: + page = balance_classzone(classzone, gfp_mask, order, &freed); + if (page) + return page; + + zone = zonelist->zones; + if (likely(freed)) { + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { + page = rmqueue(z, order); + if (page) + return page; + } + } + goto rebalance; + } else { + /* + * Check that no other task has been killed meanwhile; + * in such a case we can succeed the allocation. 
+ */ + for (;;) { + zone_t *z = *(zone++); + if (!z) + break; + + if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { + page = rmqueue(z, order); + if (page) + return page; + } + } + } + + out: + printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", + order, gfp_mask, !!(current->flags & PF_MEMALLOC)); + if (unlikely(vm_gfp_debug)) + dump_stack(); + return NULL; +} + +/* + * Common helper functions. + */ +unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) +{ + struct page * page; + + page = alloc_pages(gfp_mask, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} + +unsigned long get_zeroed_page(unsigned int gfp_mask) +{ + struct page * page; + + page = alloc_pages(gfp_mask, 0); + if (page) { + void *address = page_address(page); + clear_page(address); + return (unsigned long) address; + } + return 0; +} + +void __free_pages(struct page *page, unsigned int order) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __free_pages_ok(page, order); +} + +void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) + __free_pages(virt_to_page(addr), order); +} + +/* + * Total amount of free (allocatable) RAM: + */ +unsigned int nr_free_pages (void) +{ + unsigned int sum = 0; + zone_t *zone; + + for_each_zone(zone) + sum += zone->free_pages; + + return sum; +} + +/* + * Amount of free RAM allocatable as buffer memory: + */ +unsigned int nr_free_buffer_pages (void) +{ + pg_data_t *pgdat; + unsigned int sum = 0; + zonelist_t *zonelist; + zone_t **zonep, *zone; + + for_each_pgdat(pgdat) { + int class_idx; + zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); + zonep = zonelist->zones; + zone = *zonep; + class_idx = zone_idx(zone); + + sum += zone->nr_cache_pages; + for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { + int free = zone->free_pages - zone->watermarks[class_idx].high; + if (free <= 0) + continue; + sum += free; + } + } + + return sum; +} + +#if CONFIG_HIGHMEM +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} + +unsigned int freeable_lowmem(void) +{ + unsigned int pages = 0; + pg_data_t *pgdat; + + for_each_pgdat(pgdat) { + pages += pgdat->node_zones[ZONE_DMA].free_pages; + pages += pgdat->node_zones[ZONE_DMA].nr_active_pages; + pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages; + pages += pgdat->node_zones[ZONE_NORMAL].free_pages; + pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages; + pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages; + } + + return pages; +} +#endif + +#define K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. 
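The helper functions above are the interface the rest of the kernel sees; a typical (sketch-level) use under 2.4 conventions:

unsigned long addr = __get_free_pages(GFP_KERNEL, 1);  /* 2 contiguous pages */
unsigned long zero = get_zeroed_page(GFP_KERNEL);      /* 1 page, cleared    */
if (addr) free_pages(addr, 1);                         /* order must match   */
if (zero) free_pages(zero, 0);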
+ */ +void show_free_areas_core(pg_data_t *pgdat) +{ + unsigned int order; + unsigned type; + pg_data_t *tmpdat = pgdat; + + printk("Free pages: %6dkB (%6dkB HighMem)\n", + K(nr_free_pages()), + K(nr_free_highpages())); + + while (tmpdat) { + zone_t *zone; + for (zone = tmpdat->node_zones; + zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) + printk("Zone:%s freepages:%6lukB\n", + zone->name, + K(zone->free_pages)); + + tmpdat = tmpdat->node_next; + } + + printk("( Active: %d, inactive: %d, free: %d )\n", + nr_active_pages, + nr_inactive_pages, + nr_free_pages()); + + for (type = 0; type < MAX_NR_ZONES; type++) { + struct list_head *head, *curr; + zone_t *zone = pgdat->node_zones + type; + unsigned long nr, total, flags; + + total = 0; + if (zone->size) { + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + head = &(zone->free_area + order)->free_list; + curr = head; + nr = 0; + for (;;) { + if ((curr = curr->next) == head) + break; + nr++; + } + total += nr * (1 << order); + printk("%lu*%lukB ", nr, K(1UL) << order); + } + spin_unlock_irqrestore(&zone->lock, flags); + } + printk("= %lukB)\n", K(total)); + } + +#ifdef SWAP_CACHE_INFO + show_swap_cache_info(); +#endif +} + +void show_free_areas(void) +{ + show_free_areas_core(pgdat_list); +} + +/* + * Builds allocation fallback zone lists. + */ +static inline void build_zonelists(pg_data_t *pgdat) +{ + int i, j, k; + + for (i = 0; i <= GFP_ZONEMASK; i++) { + zonelist_t *zonelist; + zone_t *zone; + + zonelist = pgdat->node_zonelists + i; + memset(zonelist, 0, sizeof(*zonelist)); + + j = 0; + k = ZONE_NORMAL; + if (i & __GFP_HIGHMEM) + k = ZONE_HIGHMEM; + if (i & __GFP_DMA) + k = ZONE_DMA; + + switch (k) { + default: + BUG(); + /* + * fallthrough: + */ + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->size) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->size) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->size) + zonelist->zones[j++] = zone; + } + zonelist->zones[j++] = NULL; + } +} + +/* + * Helper functions to size the waitqueue hash table. + * Essentially these want to choose hash table sizes sufficiently + * large so that collisions trying to wait on pages are rare. + * But in fact, the number of active page waitqueues on typical + * systems is ridiculously low, less than 200. So this is even + * conservative, even though it seems large. + * + * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to + * waitqueues, i.e. the size of the waitq table given the number of pages. + */ +#define PAGES_PER_WAITQUEUE 256 + +static inline unsigned long wait_table_size(unsigned long pages) +{ + unsigned long size = 1; + + pages /= PAGES_PER_WAITQUEUE; + + while (size < pages) + size <<= 1; + + /* + * Once we have dozens or even hundreds of threads sleeping + * on IO we've got bigger problems than wait queue collision. + * Limit the size of the wait table to a reasonable size. + */ + size = min(size, 4096UL); + + return size; +} + +/* + * This is an integer logarithm so that shifts can be used later + * to extract the more random high bits from the multiplicative + * hash function before the remainder is taken. 
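wait_table_size() above rounds pages/PAGES_PER_WAITQUEUE up to a power of two and caps the result at 4096 entries. The same computation, extracted so the numbers can be checked in isolation:

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long wait_table_size(unsigned long pages)
{
    unsigned long size = 1;
    pages /= PAGES_PER_WAITQUEUE;
    while (size < pages)
        size <<= 1;
    return size < 4096UL ? size : 4096UL;   /* same cap as min(size, 4096UL) */
}

int main(void)
{
    /* 128 MiB of 4 KiB pages = 32768 pages -> 128 hashed wait queues */
    printf("%lu\n", wait_table_size(32768));
    return 0;
}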
+ */ +static inline unsigned long wait_table_bits(unsigned long size) +{ + return ffz(~size); +} + +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + */ +void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, + unsigned long *zones_size, unsigned long zone_start_paddr, + unsigned long *zholes_size, struct page *lmem_map) +{ + unsigned long i, j; + unsigned long map_size; + unsigned long totalpages, offset, realtotalpages; + const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + + if (zone_start_paddr & ~PAGE_MASK) + BUG(); + + totalpages = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long size = zones_size[i]; + totalpages += size; + } + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + + printk("On node %d totalpages: %lu\n", nid, realtotalpages); + + /* + * Some architectures (with lots of mem and discontiguous memory + * maps) have to search for a good mem_map area: + * For discontigmem, the conceptual mem map array starts from + * PAGE_OFFSET, we need to align the actual array onto a mem map + * boundary, so that MAP_NR works. + */ + map_size = (totalpages + 1)*sizeof(struct page); + if (lmem_map == (struct page *)0) { + lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); + lmem_map = (struct page *)(PAGE_OFFSET + + MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); + } + *gmap = pgdat->node_mem_map = lmem_map; + pgdat->node_size = totalpages; + pgdat->node_start_paddr = zone_start_paddr; + pgdat->node_start_mapnr = (lmem_map - mem_map); + pgdat->nr_zones = 0; + + offset = lmem_map - mem_map; + for (j = 0; j < MAX_NR_ZONES; j++) { + zone_t *zone = pgdat->node_zones + j; + unsigned long mask; + unsigned long size, realsize; + int idx; + + zone_table[nid * MAX_NR_ZONES + j] = zone; + realsize = size = zones_size[j]; + if (zholes_size) + realsize -= zholes_size[j]; + + printk("zone(%lu): %lu pages.\n", j, size); + zone->size = size; + zone->realsize = realsize; + zone->name = zone_names[j]; + zone->lock = SPIN_LOCK_UNLOCKED; + zone->zone_pgdat = pgdat; + zone->free_pages = 0; + zone->need_balance = 0; + zone->nr_active_pages = zone->nr_inactive_pages = 0; + + + if (!size) + continue; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
+ */ + zone->wait_table_size = wait_table_size(size); + zone->wait_table_shift = + BITS_PER_LONG - wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); + + pgdat->nr_zones = j+1; + + mask = (realsize / zone_balance_ratio[j]); + if (mask < zone_balance_min[j]) + mask = zone_balance_min[j]; + else if (mask > zone_balance_max[j]) + mask = zone_balance_max[j]; + zone->watermarks[j].min = mask; + zone->watermarks[j].low = mask*2; + zone->watermarks[j].high = mask*3; + /* now set the watermarks of the lower zones in the "j" classzone */ + for (idx = j-1; idx >= 0; idx--) { + zone_t * lower_zone = pgdat->node_zones + idx; + unsigned long lower_zone_reserve; + if (!lower_zone->size) + continue; + + mask = lower_zone->watermarks[idx].min; + lower_zone->watermarks[j].min = mask; + lower_zone->watermarks[j].low = mask*2; + lower_zone->watermarks[j].high = mask*3; + + /* now the brainer part */ + lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; + lower_zone->watermarks[j].min += lower_zone_reserve; + lower_zone->watermarks[j].low += lower_zone_reserve; + lower_zone->watermarks[j].high += lower_zone_reserve; + + realsize += lower_zone->realsize; + } + + zone->zone_mem_map = mem_map + offset; + zone->zone_start_mapnr = offset; + zone->zone_start_paddr = zone_start_paddr; + + if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) + printk("BUG: wrong zone alignment, it will crash\n"); + + /* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. + */ + for (i = 0; i < size; i++) { + struct page *page = mem_map + offset + i; + set_page_zone(page, nid * MAX_NR_ZONES + j); + set_page_count(page, 0); + SetPageReserved(page); + INIT_LIST_HEAD(&page->list); + if (j != ZONE_HIGHMEM) + set_page_address(page, __va(zone_start_paddr)); + zone_start_paddr += PAGE_SIZE; + } + + offset += size; + for (i = 0; ; i++) { + unsigned long bitmap_size; + + INIT_LIST_HEAD(&zone->free_area[i].free_list); + if (i == MAX_ORDER-1) { + zone->free_area[i].map = NULL; + break; + } + + /* + * Page buddy system uses "index >> (i+1)", + * where "index" is at most "size-1". + * + * The extra "+3" is to round down to byte + * size (8 bits per byte assumption). Thus + * we get "(size-1) >> (i+4)" as the last byte + * we can access. + * + * The "+1" is because we want to round the + * byte allocation up rather than down. So + * we should have had a "+7" before we shifted + * down by three. Also, we have to add one as + * we actually _use_ the last bit (it's [0,n] + * inclusive, not [0,n[). + * + * So we actually had +7+1 before we shift + * down by 3. But (n+8) >> 3 == (n >> 3) + 1 + * (modulo overflows, which we do not have). + * + * Finally, we LONG_ALIGN because all bitmap + * operations are on longs. 
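The watermark setup above reduces to mask = realsize / zone_balance_ratio clamped to [zone_balance_min, zone_balance_max], then min/low/high = mask, 2*mask, 3*mask (before any lower-zone reserve is added in). A worked instance for a hypothetical 16384-page zone with the default ratio of 128:

#include <stdio.h>

int main(void)
{
    unsigned long realsize = 16384;        /* 64 MiB of 4 KiB pages */
    unsigned long mask = realsize / 128;   /* zone_balance_ratio    */

    if (mask < 20)   mask = 20;            /* zone_balance_min      */
    if (mask > 255)  mask = 255;           /* zone_balance_max      */

    printf("min=%lu low=%lu high=%lu\n", mask, mask * 2, mask * 3);
    return 0;                              /* min=128 low=256 high=384 */
}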
+ */ + bitmap_size = (size-1) >> (i+4); + bitmap_size = LONG_ALIGN(bitmap_size+1); + zone->free_area[i].map = + (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); + } + } + build_zonelists(pgdat); +} + +void __init free_area_init(unsigned long *zones_size) +{ + free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); +} + +static int __init setup_mem_frac(char *str) +{ + int j = 0; + + while (get_option(&str, &zone_balance_ratio[j++]) == 2); + printk("setup_mem_frac: "); + for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); + printk("\n"); + return 1; +} + +__setup("memfrac=", setup_mem_frac); + +static int __init setup_lower_zone_reserve(char *str) +{ + int j = 0; + + while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2); + printk("setup_lower_zone_reserve: "); + for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); + printk("\n"); + return 1; +} + +__setup("lower_zone_reserve=", setup_lower_zone_reserve);
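The free-area bitmap sizing in the loop above packs one bit per buddy pair: (size-1) >> (i+4) is one bit per 2^(i+1) pages expressed in bytes (rounded down), and the +1 plus LONG_ALIGN round the byte count back up to whole longs, exactly as the long comment in the hunk derives. Checked in isolation for a 32768-page zone at order 0:

#include <stdio.h>

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

int main(void)
{
    unsigned long size = 32768;  /* pages in the zone */
    unsigned long i = 0;         /* order             */
    unsigned long bitmap_size = (size - 1) >> (i + 4);

    bitmap_size = LONG_ALIGN(bitmap_size + 1);
    /* 16384 buddy pairs -> 16384 bits -> 2048 bytes (already long-aligned) */
    printf("%lu bytes\n", bitmap_size);
    return 0;
}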