author     kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>  2004-05-07 14:53:28 +0000
committer  kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>  2004-05-07 14:53:28 +0000
commit     1de448f4c54eac94a966d65e72b15bcbef3a7e5d (patch)
tree       cc2a452db9507208580071b8676288d9b1dde5ea
parent     747a8d04495070f12d625e2047b07eb3967ca9b8 (diff)
bitkeeper revision 1.891.1.5 (409ba2e8A6F60eP06BqyZUGapsn8XA)
Network interface for new IO model is now completed.
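
The sequence a management client goes through to request the new per-domain network interface is visible in the tools/examples/xc_dom_create.py hunk below. A minimal sketch of that flow, assuming the xenctl library from this tree (xend_control_message and the response keys are taken from that hunk; create_vif and dom_id are illustrative names, not part of the patch):

    import sys
    import xenctl.utils

    def create_vif(dom_id):
        # Ask xend to create the (handle 0) network interface for the domain.
        # xend defers its reply until the netif backend driver acknowledges
        # the CMSG_NETIF_BE_CREATE message (see manager.py and netif.py below).
        cmsg = 'new_network_interface(dom=' + str(dom_id) + ')'
        rsp = xenctl.utils.xend_control_message(cmsg)
        if not rsp['success']:
            print "Error creating network interface:", rsp['error_type']
            sys.exit(1)
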
-rw-r--r--  .rootkeys  3
-rwxr-xr-x  tools/examples/xc_dom_create.py  13
-rw-r--r--  tools/xenctl/lib/utils.py  10
-rw-r--r--  tools/xend/lib/domain_controller.h  1
-rwxr-xr-x  tools/xend/lib/main.py  25
-rw-r--r--  tools/xend/lib/manager.py  39
-rw-r--r--  tools/xend/lib/netif.py  144
-rw-r--r--  xen/common/dom_mem_ops.c  12
-rw-r--r--  xen/common/domain.c  4
-rw-r--r--  xen/common/kernel.c  2
-rw-r--r--  xen/common/memory.c  12
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/config.in  2
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/defconfig  2
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev  1
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h  1
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c  3
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c  2
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c  58
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c  4
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h  17
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c  96
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c  2
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c  2
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c  5
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c  42
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c  128
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c  239
-rw-r--r--  xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c  83
-rw-r--r--  xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c  2
-rw-r--r--  xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h  8
-rw-r--r--  xenolinux-2.4.26-sparse/include/asm-xen/io.h  39
-rw-r--r--  xenolinux-2.4.26-sparse/include/asm-xen/pci.h  283
-rwxr-xr-x  xenolinux-2.4.26-sparse/mkbuildtree  1
-rw-r--r--  xenolinux-2.4.26-sparse/mm/page_alloc.c  930
34 files changed, 1866 insertions, 349 deletions
diff --git a/.rootkeys b/.rootkeys
index 5a7a5d2803..4c888bbc8e 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -107,6 +107,7 @@
4048c0ddsF0WrU7HUzTvg1MJoCIfWg tools/xend/lib/domain_controller.h
4054a301VEag2GwrBrFBna5U1BGlLA tools/xend/lib/main.py
4055ad9ah9IuC3sJT2c_gYIFY5Tw_g tools/xend/lib/manager.py
+409ba2e729HhE7fEra4B5EqX-F8Xzw tools/xend/lib/netif.py
40431ac8wrUEj-XM7B8smFtx_HA7lQ tools/xend/lib/utils.c
4054a2fdkdATEnRw-U7AUlgu-6JiUA tools/xend/setup.py
4056cd26Qyp09iNoOjrvzg8KYzSqOw tools/xend/xend
@@ -735,6 +736,7 @@
3f8707e7ZmZ6TxyX0ZUEfvhA2Pb_xQ xenolinux-2.4.26-sparse/include/asm-xen/msr.h
3e7270deQqtGPSnFxcW4AvJZuTUWfg xenolinux-2.4.26-sparse/include/asm-xen/multicall.h
3e5a4e67mnQfh-R8KcQCaVo2Oho6yg xenolinux-2.4.26-sparse/include/asm-xen/page.h
+409ba2e7ZfV5hqTvIzxLtpClnxtIzg xenolinux-2.4.26-sparse/include/asm-xen/pci.h
3e5a4e67uTYU5oEnIDjxuaez8njjqg xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h
3e5a4e67X7JyupgdYkgDX19Huj2sAw xenolinux-2.4.26-sparse/include/asm-xen/pgtable-2level.h
3e5a4e67gr4NLGtQ5CvSLimMYZlkOA xenolinux-2.4.26-sparse/include/asm-xen/pgtable.h
@@ -762,6 +764,7 @@
3e5a4e68GxCIaFH4sy01v1wjapetaA xenolinux-2.4.26-sparse/mm/memory.c
3f108af5VxPkLv13tXpXgoRKALQtXQ xenolinux-2.4.26-sparse/mm/mprotect.c
3e5a4e681xMPdF9xCMwpyfuYMySU5g xenolinux-2.4.26-sparse/mm/mremap.c
+409ba2e7akOFqQUg6Qyg2s28xcXiMg xenolinux-2.4.26-sparse/mm/page_alloc.c
3e5a4e683HKVU-sxtagrDasRB8eBVw xenolinux-2.4.26-sparse/mm/swapfile.c
3f108af81Thhb242EmKjGCYkjx-GJA xenolinux-2.4.26-sparse/mm/vmalloc.c
407eb087XaNDLn8thVDLH-rI0hG-Xw xenolinux-sparse
diff --git a/tools/examples/xc_dom_create.py b/tools/examples/xc_dom_create.py
index 22479a9d14..19bb2ac9df 100755
--- a/tools/examples/xc_dom_create.py
+++ b/tools/examples/xc_dom_create.py
@@ -333,7 +333,18 @@ def make_domain():
xc.domain_destroy ( dom=id )
sys.exit()
- if not new_io_world:
+ if new_io_world:
+ cmsg = 'new_network_interface(dom='+str(id)+')'
+ xend_response = xenctl.utils.xend_control_message(cmsg)
+ if not xend_response['success']:
+ print "Error creating network interface"
+ print "Error type: " + xend_response['error_type']
+ if xend_response['error_type'] == 'exception':
+ print "Exception type: " + xend_response['exception_type']
+ print "Exception val: " + xend_response['exception_value']
+ xc.domain_destroy ( dom=id )
+ sys.exit()
+ else:
# setup virtual firewall rules for all aliases
for ip in vfr_ipaddr:
xenctl.utils.setup_vfr_rules_for_vif( id, 0, ip )
diff --git a/tools/xenctl/lib/utils.py b/tools/xenctl/lib/utils.py
index 3f0914f73f..11aadb4f08 100644
--- a/tools/xenctl/lib/utils.py
+++ b/tools/xenctl/lib/utils.py
@@ -54,15 +54,13 @@ def get_current_ipmask(dev='eth0'):
return m.group(1)
return None
-def get_current_ipgw(dev='eth0'):
- """Return a string containing the IP gateway for the given
- network interface (default 'eth0').
- """
+def get_current_ipgw():
+ """Return a string containing the default IP gateway."""
fd = os.popen( '/sbin/route -n' )
lines = fd.readlines()
for line in lines:
- m = re.search( '^\S+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
- '\s+\S+\s+\S*G.*' + dev + '.*', line )
+ m = re.search( '^0.0.0.0+\s+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)' +
+ '\s+0.0.0.0+\s+\S*G.*', line )
if m:
return m.group(1)
return None
diff --git a/tools/xend/lib/domain_controller.h b/tools/xend/lib/domain_controller.h
index d5c397fe96..566967dc38 100644
--- a/tools/xend/lib/domain_controller.h
+++ b/tools/xend/lib/domain_controller.h
@@ -468,7 +468,6 @@ typedef struct {
unsigned int evtchn; /* Event channel for notifications. */
unsigned long tx_shmem_frame; /* Page cont. tx shared comms window. */
unsigned long rx_shmem_frame; /* Page cont. rx shared comms window. */
- unsigned long shmem_frame;
/* OUT */
unsigned int status;
} netif_be_connect_t;
diff --git a/tools/xend/lib/main.py b/tools/xend/lib/main.py
index 7b5adbab83..0056783d7f 100755
--- a/tools/xend/lib/main.py
+++ b/tools/xend/lib/main.py
@@ -5,7 +5,7 @@
###########################################################
import errno, re, os, pwd, select, signal, socket, struct, sys, time
-import xend.blkif, xend.console, xend.manager, xend.utils, Xc
+import xend.blkif, xend.netif, xend.console, xend.manager, xend.utils, Xc
# The following parameters could be placed in a configuration file.
@@ -19,6 +19,8 @@ UNIX_SOCK = 'management_sock' # relative to CONTROL_DIR
CMSG_CONSOLE = 0
CMSG_BLKIF_BE = 1
CMSG_BLKIF_FE = 2
+CMSG_NETIF_BE = 3
+CMSG_NETIF_FE = 4
def port_from_dom(dom):
@@ -162,6 +164,10 @@ def daemon_loop():
if xend.blkif.interface.list.has_key(idx):
blk_if = xend.blkif.interface.list[idx]
+ net_if = False
+ if xend.netif.interface.list.has_key(idx):
+ net_if = xend.netif.interface.list[idx]
+
# If we pick up a disconnect notification then we do any necessary
# cleanup.
if type == notifier.EXCEPTION:
@@ -175,6 +181,9 @@ def daemon_loop():
if blk_if:
blk_if.destroy()
del blk_if
+ if net_if:
+ net_if.destroy()
+ del net_if
continue
# Process incoming requests.
@@ -188,6 +197,10 @@ def daemon_loop():
blk_if.ctrlif_rx_req(port, msg)
elif type == CMSG_BLKIF_BE and port == dom0_port:
xend.blkif.backend_rx_req(port, msg)
+ elif type == CMSG_NETIF_FE and net_if:
+ net_if.ctrlif_rx_req(port, msg)
+ elif type == CMSG_NETIF_BE and port == dom0_port:
+ xend.netif.backend_rx_req(port, msg)
else:
port.write_response(msg)
@@ -198,6 +211,8 @@ def daemon_loop():
type = (msg.get_header())['type']
if type == CMSG_BLKIF_BE and port == dom0_port:
xend.blkif.backend_rx_rsp(port, msg)
+ elif type == CMSG_NETIF_BE and port == dom0_port:
+ xend.netif.backend_rx_rsp(port, msg)
# Send console data.
if con_if and con_if.ctrlif_transmit_work(port):
@@ -207,10 +222,18 @@ def daemon_loop():
if blk_if and blk_if.ctrlif_transmit_work(port):
work_done = True
+ # Send netif messages.
+ if net_if and net_if.ctrlif_transmit_work(port):
+ work_done = True
+
# Back-end block-device work.
if port == dom0_port and xend.blkif.backend_do_work(port):
work_done = True
+ # Back-end network-device work.
+ if port == dom0_port and xend.netif.backend_do_work(port):
+ work_done = True
+
# Finally, notify the remote end of any work that we did.
if work_done:
port.notify()
diff --git a/tools/xend/lib/manager.py b/tools/xend/lib/manager.py
index ea7398cd4c..2f15683d66 100644
--- a/tools/xend/lib/manager.py
+++ b/tools/xend/lib/manager.py
@@ -4,7 +4,7 @@
## Copyright (c) 2004, K A Fraser (University of Cambridge)
#############################################################
-import xend.blkif, xend.console, xend.main, xend.utils
+import xend.blkif, xend.netif, xend.console, xend.main, xend.utils
##
@@ -113,3 +113,40 @@ def new_block_device(dom, handle, vdev, pdev, start_sect, nr_sect, readonly):
# Response is deferred until back-end driver sends acknowledgement.
return None
+
+
+##
+## new_network_interface:
+## Create a new network interface for the specified domain @dom.
+##
+def new_network_interface(dom, handle=-1):
+ # By default we create an interface with handle zero.
+ if handle < 0:
+ handle = 0
+
+ # We only support one interface per domain, which must have handle zero.
+ if handle != 0:
+ response = { 'success': False }
+ response['error_type'] = 'Bad handle %d (only handle 0 ' + \
+ 'is supported)' % handle
+ return response
+
+ # Find local event-channel port associated with the specified domain.
+ port = xend.main.port_from_dom(dom)
+ if not port:
+ response = { 'success': False }
+ response['error_type'] = 'Unknown domain %d' % dom
+ return response
+
+ # The interface must not already exist.
+ if xend.netif.interface.list.has_key(port.local_port):
+ response = { 'success': False }
+ response['error_type'] = 'Interface (dom=%d,handle=%d) already ' + \
+ 'exists' % (dom, handle)
+ return response
+
+ # Create the new interface. Initially no virtual devices are attached.
+ xend.netif.interface(dom, port.local_port)
+
+ # Response is deferred until back-end driver sends acknowledgement.
+ return None
diff --git a/tools/xend/lib/netif.py b/tools/xend/lib/netif.py
new file mode 100644
index 0000000000..11756c5e56
--- /dev/null
+++ b/tools/xend/lib/netif.py
@@ -0,0 +1,144 @@
+
+###################################################################
+## xend/netif.py -- Network-interface management functions for Xend
+## Copyright (c) 2004, K A Fraser (University of Cambridge)
+###################################################################
+
+import errno, random, re, os, select, signal, socket, struct, sys
+import xend.main, xend.console, xend.manager, xend.utils, Xc
+
+CMSG_NETIF_BE = 3
+CMSG_NETIF_FE = 4
+CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED = 0
+CMSG_NETIF_FE_DRIVER_STATUS_CHANGED = 32
+CMSG_NETIF_FE_INTERFACE_CONNECT = 33
+CMSG_NETIF_FE_INTERFACE_DISCONNECT = 34
+CMSG_NETIF_BE_CREATE = 0
+CMSG_NETIF_BE_DESTROY = 1
+CMSG_NETIF_BE_CONNECT = 2
+CMSG_NETIF_BE_DISCONNECT = 3
+
+pendmsg = None
+pendaddr = None
+
+def backend_tx_req(msg):
+ port = xend.main.dom0_port
+ if port.space_to_write_request():
+ port.write_request(msg)
+ port.notify()
+ else:
+ xend.netif.pendmsg = msg
+
+def backend_rx_req(port, msg):
+ port.write_response(msg)
+
+def backend_rx_rsp(port, msg):
+ subtype = (msg.get_header())['subtype']
+ print "Received netif-be response, subtype %d" % subtype
+ if subtype == CMSG_NETIF_BE_CREATE:
+ rsp = { 'success': True }
+ xend.main.send_management_response(rsp, xend.netif.pendaddr)
+ elif subtype == CMSG_NETIF_BE_CONNECT:
+ (dom,hnd,evtchn,tx_frame,rx_frame,st) = \
+ struct.unpack("QIILLI", msg.get_payload())
+ netif = interface.list[xend.main.port_from_dom(dom).local_port]
+ msg = xend.utils.message(CMSG_NETIF_FE, \
+ CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
+ msg.append_payload(struct.pack("IIIBBBBBBBB",0,2, \
+ netif.evtchn['port2'], \
+ netif.mac[0],netif.mac[1], \
+ netif.mac[2],netif.mac[3], \
+ netif.mac[4],netif.mac[5], \
+ 0,0))
+ netif.ctrlif_tx_req(xend.main.port_list[netif.key], msg)
+
+def backend_do_work(port):
+ global pendmsg
+ if pendmsg and port.space_to_write_request():
+ port.write_request(pendmsg)
+ pendmsg = None
+ return True
+ return False
+
+
+class interface:
+
+ # Dictionary of all network-device interfaces.
+ list = {}
+
+
+ # NB. 'key' is an opaque value that has no meaning in this class.
+ def __init__(self, dom, key):
+ self.dom = dom
+ self.key = key
+ self.pendmsg = None
+
+ # VIFs get a random MAC address with a "special" vendor id.
+ #
+ # NB. The vendor is currently an "obsolete" one that used to belong
+ # to DEC (AA-00-00). Using it is probably a bit rude :-)
+ #
+ # NB2. The first bit of the first random octet is set to zero for
+ # all dynamic MAC addresses. This may allow us to manually specify
+ # MAC addresses for some VIFs with no fear of clashes.
+ self.mac = [ 0xaa, 0x00, 0x00 ]
+ self.mac.append(int(random.random()*128))
+ self.mac.append(int(random.random()*256))
+ self.mac.append(int(random.random()*256))
+
+ interface.list[key] = self
+ msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_CREATE, 0)
+ msg.append_payload(struct.pack("QIBBBBBBBBI",dom,0, \
+ self.mac[0],self.mac[1], \
+ self.mac[2],self.mac[3], \
+ self.mac[4],self.mac[5], \
+ 0,0,0))
+ xend.netif.pendaddr = xend.main.mgmt_req_addr
+ backend_tx_req(msg)
+
+
+ # Completely destroy this interface.
+ def destroy(self):
+ del interface.list[self.key]
+ msg = xend.utils.message(CMSG_NETIF_BE, CMSG_NETIF_BE_DESTROY, 0)
+ msg.append_payload(struct.pack("QII",self.dom,0,0))
+ backend_tx_req(msg)
+
+
+ # The parameter @port is the control-interface event channel. This method
+ # returns True if messages were written to the control interface.
+ def ctrlif_transmit_work(self, port):
+ if self.pendmsg and port.space_to_write_request():
+ port.write_request(self.pendmsg)
+ self.pendmsg = None
+ return True
+ return False
+
+ def ctrlif_tx_req(self, port, msg):
+ if port.space_to_write_request():
+ port.write_request(msg)
+ port.notify()
+ else:
+ self.pendmsg = msg
+
+ def ctrlif_rx_req(self, port, msg):
+ port.write_response(msg)
+ subtype = (msg.get_header())['subtype']
+ if subtype == CMSG_NETIF_FE_DRIVER_STATUS_CHANGED:
+ msg = xend.utils.message(CMSG_NETIF_FE, \
+ CMSG_NETIF_FE_INTERFACE_STATUS_CHANGED, 0)
+ msg.append_payload(struct.pack("IIIBBBBBBBB",0,1,0,self.mac[0], \
+ self.mac[1],self.mac[2], \
+ self.mac[3],self.mac[4], \
+ self.mac[5],0,0))
+ self.ctrlif_tx_req(port, msg)
+ elif subtype == CMSG_NETIF_FE_INTERFACE_CONNECT:
+ (hnd,tx_frame,rx_frame) = struct.unpack("ILL", msg.get_payload())
+ xc = Xc.new()
+ self.evtchn = xc.evtchn_bind_interdomain(dom1=0,dom2=self.dom)
+ msg = xend.utils.message(CMSG_NETIF_BE, \
+ CMSG_NETIF_BE_CONNECT, 0)
+ msg.append_payload(struct.pack("QIILLI",self.dom,0, \
+ self.evtchn['port1'],tx_frame, \
+ rx_frame,0))
+ backend_tx_req(msg)
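
The fixed-format payloads above are built with Python's struct module. A minimal sketch (not part of the patch) of packing and then unpacking a CMSG_NETIF_BE_CREATE payload with the same "QIBBBBBBBBI" format string used in netif.py; reading the fields as (domid, handle, 6-byte MAC, two pad bytes, status) is an assumption based on the surrounding code, and the values are placeholders:

    import struct

    # domid=7, handle=0, MAC aa:00:00:12:34:56, two pad bytes, status=0
    payload = struct.pack("QIBBBBBBBBI", 7, 0,
                          0xaa, 0x00, 0x00, 0x12, 0x34, 0x56, 0, 0, 0)
    (dom, hnd, m0, m1, m2, m3, m4, m5, p0, p1, st) = \
        struct.unpack("QIBBBBBBBBI", payload)
    print "dom=%d handle=%d mac=%02x:%02x:%02x:%02x:%02x:%02x" % \
          (dom, hnd, m0, m1, m2, m3, m4, m5)
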
diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c
index 7d596026f9..79d0bb1df1 100644
--- a/xen/common/dom_mem_ops.c
+++ b/xen/common/dom_mem_ops.c
@@ -27,13 +27,21 @@ static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
{
/* Leave some slack pages; e.g., for the network. */
if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >>
- (PAGE_SHIFT-10))) )
+ (PAGE_SHIFT-10))) )
+ {
+ DPRINTK("Not enough slack: %u %u\n",
+ free_pfns,
+ SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10));
break;
+ }
/* NB. 'alloc_domain_page' does limit checking on pages per domain. */
if ( unlikely((page = alloc_domain_page(p)) == NULL) )
+ {
+ DPRINTK("Could not allocate a frame\n");
break;
-
+ }
+
/* Inform the domain of the new page's machine address. */
mpfn = (unsigned long)(page - frame_table);
copy_to_user(op.pages, &mpfn, sizeof(mpfn));
diff --git a/xen/common/domain.c b/xen/common/domain.c
index a9c40ae98f..1b8759e912 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -334,6 +334,8 @@ struct pfn_info *alloc_domain_page(struct task_struct *p)
spin_lock(&p->page_list_lock);
if ( unlikely(p->tot_pages >= p->max_pages) )
{
+ DPRINTK("Over-allocation for domain %llu: %u >= %u\n",
+ p->domain, p->tot_pages, p->max_pages);
spin_unlock(&p->page_list_lock);
goto free_and_exit;
}
@@ -884,7 +886,7 @@ int construct_dom0(struct task_struct *p,
page->type_and_flags = 0;
page->count_and_flags = PGC_allocated | 1;
list_add_tail(&page->list, &p->page_list);
- p->tot_pages++;
+ p->tot_pages++; p->max_pages++;
}
mpt_alloc = (vpt_start - v_start) + alloc_start;
diff --git a/xen/common/kernel.c b/xen/common/kernel.c
index 7f814391cf..0d5fa023a1 100644
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -105,7 +105,6 @@ static struct {
void cmain(unsigned long magic, multiboot_info_t *mbi)
{
struct task_struct *new_dom;
- dom0_createdomain_t dom0_params;
unsigned long max_page;
unsigned char *cmdline;
module_t *mod = (module_t *)__va(mbi->mods_addr);
@@ -263,7 +262,6 @@ void cmain(unsigned long magic, multiboot_info_t *mbi)
task_hash[TASK_HASH(IDLE_DOMAIN_ID)] = &idle0_task;
/* Create initial domain 0. */
- dom0_params.memory_kb = opt_dom0_mem;
new_dom = do_createdomain(0, 0);
if ( new_dom == NULL )
panic("Error creating domain 0\n");
diff --git a/xen/common/memory.c b/xen/common/memory.c
index e4d0590a57..5acfae8482 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -940,17 +940,25 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
}
break;
+ /* XXX This function is racey! */
case MMUEXT_REASSIGN_PAGE:
- if ( !IS_PRIV(current) )
+ if ( unlikely(!IS_PRIV(current)) )
{
MEM_LOG("Dom %llu has no privilege to reassign page ownership",
current->domain);
okay = 0;
}
- else if ( percpu_info[cpu].gps != NULL )
+ else if ( likely(percpu_info[cpu].gps != NULL) )
{
+ current->tot_pages--;
+ percpu_info[cpu].gps->tot_pages++;
page->u.domain = percpu_info[cpu].gps;
}
+ else
+ {
+ MEM_LOG("No GPS to reassign pfn %08lx to\n", pfn);
+ okay = 0;
+ }
break;
case MMUEXT_RESET_SUBJECTDOM:
diff --git a/xenolinux-2.4.26-sparse/arch/xen/config.in b/xenolinux-2.4.26-sparse/arch/xen/config.in
index 16fa5e66d4..7f961d8521 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/config.in
+++ b/xenolinux-2.4.26-sparse/arch/xen/config.in
@@ -101,6 +101,8 @@ if [ "$CONFIG_HIGHMEM" = "y" ]; then
bool 'HIGHMEM I/O support' CONFIG_HIGHIO
fi
+define_int CONFIG_FORCE_MAX_ZONEORDER 12
+
#bool 'Symmetric multi-processing support' CONFIG_SMP
#if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then
# define_bool CONFIG_HAVE_DEC_LOCK y
diff --git a/xenolinux-2.4.26-sparse/arch/xen/defconfig b/xenolinux-2.4.26-sparse/arch/xen/defconfig
index eaa9171b1f..013e732c3f 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/defconfig
+++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig
@@ -50,6 +50,7 @@ CONFIG_X86_TSC=y
CONFIG_X86_L1_CACHE_SHIFT=5
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
+CONFIG_FORCE_MAX_ZONEORDER=12
#
# General setup
@@ -156,6 +157,7 @@ CONFIG_IP_NF_TARGET_ULOG=y
# Network testing
#
# CONFIG_NET_PKTGEN is not set
+CONFIG_NETDEVICES=y
#
# Block devices
diff --git a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev
index 41b05aaaa7..3be5b50bfa 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev
+++ b/xenolinux-2.4.26-sparse/arch/xen/defconfig-physdev
@@ -51,6 +51,7 @@ CONFIG_X86_TSC=y
CONFIG_X86_L1_CACHE_SHIFT=5
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
+CONFIG_FORCE_MAX_ZONEORDER=12
#
# General setup
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
index e6004b4a8e..e80435fbbb 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/common.h
@@ -10,6 +10,7 @@
#include <linux/rbtree.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include <asm/ctrl_if.h>
#include <asm/io.h>
#include "../blkif.h"
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
index 0746ecfab0..0b26224651 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/control.c
@@ -74,7 +74,8 @@ void blkif_ctrlif_init(void)
ctrl_msg_t cmsg;
blkif_be_driver_status_changed_t st;
- (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx);
+ (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
/* Send a driver-UP notification to the domain controller. */
cmsg.type = CMSG_BLKIF_BE;
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
index 9acbac35ab..14a6ab324d 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/interface.c
@@ -70,7 +70,7 @@ void blkif_create(blkif_be_create_t *create)
unsigned int handle = create->blkif_handle;
blkif_t **pblkif, *blkif;
- if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_ATOMIC)) == NULL )
+ if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
{
DPRINTK("Could not create blkif: out of memory\n");
create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
index 4b11ad9a8e..eb3e32c75f 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/main.c
@@ -24,17 +24,15 @@
#define MAX_PENDING_REQS 64
#define BATCH_PER_DOMAIN 16
-static struct vm_struct *mmap_vma;
-#define MMAP_PAGES_PER_SEGMENT \
- ((BLKIF_MAX_SEGMENTS_PER_REQUEST >> (PAGE_SHIFT-9)) + 1)
+static unsigned long mmap_vstart;
#define MMAP_PAGES_PER_REQUEST \
- (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * MMAP_PAGES_PER_SEGMENT)
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
#define MMAP_PAGES \
(MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg) \
- ((unsigned long)mmap_vma->addr + \
+#define MMAP_VADDR(_req,_seg) \
+ (mmap_vstart + \
((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
- ((_seg) * MMAP_PAGES_PER_SEGMENT * PAGE_SIZE))
+ ((_seg) * PAGE_SIZE))
/*
* Each outstanding request that we've passed to the lower device layers has a
@@ -259,11 +257,13 @@ static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW);
for ( i = 0; i < req->nr_segments; i++ )
{
- if ( (req->buffer_and_sects[i] & ~PAGE_MASK) != (PAGE_SIZE / 512) )
+ /* Make sure the buffer is page-sized. */
+ if ( (blkif_first_sect(req->frame_and_sects[i]) != 0) ||
+ (blkif_last_sect(req->frame_and_sects[i]) != 7) )
goto bad_descriptor;
rc = direct_remap_area_pages(&init_mm,
MMAP_VADDR(pending_idx, i),
- req->buffer_and_sects[i] & PAGE_MASK,
+ req->frame_and_sects[i] & PAGE_MASK,
PAGE_SIZE, prot, blkif->domid);
if ( rc != 0 )
goto bad_descriptor;
@@ -288,15 +288,15 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
struct buffer_head *bh;
int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
- unsigned short nr_sects;
- unsigned long buffer;
+ short nr_sects;
+ unsigned long buffer, fas;
int i, tot_sects, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
pending_req_t *pending_req;
pgprot_t prot;
/* We map virtual scatter/gather segments to physical segments. */
int new_segs, nr_psegs = 0;
- phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
+ phys_seg_t phys_seg[BLKIF_MAX_SEGMENTS_PER_REQUEST + 1];
/* Check that number of segments is sane. */
if ( unlikely(req->nr_segments == 0) ||
@@ -314,17 +314,12 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
*/
for ( i = tot_sects = 0; i < req->nr_segments; i++, tot_sects += nr_sects )
{
- buffer = req->buffer_and_sects[i] & ~0x1FF;
- nr_sects = req->buffer_and_sects[i] & 0x1FF;
+ fas = req->frame_and_sects[i];
+ buffer = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9);
+ nr_sects = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
- if ( unlikely(nr_sects == 0) )
- continue;
-
- if ( unlikely(nr_sects > BLKIF_MAX_SECTORS_PER_SEGMENT) )
- {
- DPRINTK("Too many sectors in segment\n");
+ if ( nr_sects <= 0 )
goto bad_descriptor;
- }
phys_seg[nr_psegs].dev = req->device;
phys_seg[nr_psegs].sector_number = req->sector_number + tot_sects;
@@ -344,7 +339,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
}
nr_psegs += new_segs;
- ASSERT(nr_psegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST*2);
+ ASSERT(nr_psegs <= (BLKIF_MAX_SEGMENTS_PER_REQUEST+1));
}
/* Nonsensical zero-sized request? */
@@ -358,13 +353,10 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
for ( i = 0; i < nr_psegs; i++ )
{
- unsigned long sz = ((phys_seg[i].buffer & ~PAGE_MASK) +
- (phys_seg[i].nr_sects << 9) +
- (PAGE_SIZE - 1)) & PAGE_MASK;
int rc = direct_remap_area_pages(&init_mm,
MMAP_VADDR(pending_idx, i),
phys_seg[i].buffer & PAGE_MASK,
- sz, prot, blkif->domid);
+ PAGE_SIZE, prot, blkif->domid);
if ( rc != 0 )
{
DPRINTK("invalid buffer\n");
@@ -372,6 +364,8 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
MMAP_PAGES_PER_REQUEST * PAGE_SIZE);
goto bad_descriptor;
}
+ phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
+ phys_seg[i].buffer >> PAGE_SHIFT;
}
pending_req = &pending_reqs[pending_idx];
@@ -399,6 +393,7 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
bh->b_rsector = (unsigned long)phys_seg[i].sector_number;
bh->b_data = (char *)MMAP_VADDR(pending_idx, i) +
(phys_seg[i].buffer & ~PAGE_MASK);
+// bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i));
bh->b_end_io = end_block_io_op;
bh->b_private = pending_req;
@@ -456,13 +451,13 @@ static int __init init_module(void)
{
int i;
+ if ( !(start_info.flags & SIF_INITDOMAIN) )
+ return 0;
+
blkif_interface_init();
- if ( (mmap_vma = get_vm_area(MMAP_PAGES * PAGE_SIZE, VM_IOREMAP)) == NULL )
- {
- printk(KERN_WARNING "Could not allocate VMA for blkif backend.\n");
- return -ENOMEM;
- }
+ if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 )
+ BUG();
pending_cons = 0;
pending_prod = MAX_PENDING_REQS;
@@ -484,6 +479,7 @@ static int __init init_module(void)
static void cleanup_module(void)
{
+ BUG();
}
module_init(init_module);
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
index 19b0b3015d..bb5b6ea743 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/backend/vbd.c
@@ -47,7 +47,7 @@ void vbd_create(blkif_be_vbd_create_t *create)
}
}
- if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_ATOMIC)) == NULL) )
+ if ( unlikely((vbd = kmalloc(sizeof(vbd_t), GFP_KERNEL)) == NULL) )
{
DPRINTK("vbd_create: out of memory\n");
create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
@@ -111,7 +111,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow)
}
if ( unlikely((x = kmalloc(sizeof(blkif_extent_le_t),
- GFP_ATOMIC)) == NULL) )
+ GFP_KERNEL)) == NULL) )
{
DPRINTK("vbd_grow: out of memory\n");
grow->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h
index 1938f68f8e..0a90744c59 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/blkif.h
@@ -26,19 +26,22 @@
*/
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
-#define BLKIF_MAX_SECTORS_PER_SEGMENT 16
-
typedef struct {
u8 operation; /* BLKIF_OP_??? */
u8 nr_segments; /* number of segments */
blkif_vdev_t device; /* only for read/write requests */
unsigned long id; /* private guest value, echoed in resp */
blkif_sector_t sector_number; /* start sector idx on disk (r/w only) */
- /* Least 9 bits is 'nr_sects'. High 23 bits is the address. */
- /* We must have '0 <= nr_sects <= BLKIF_MAX_SECTORS_PER_SEGMENT'. */
- unsigned long buffer_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect ; @f_a_s[:12]=frame. */
+ /* @first_sect: first sector in frame to transfer (inclusive). */
+ /* @last_sect: last sector in frame to transfer (inclusive). */
+ /* @frame: machine page frame number. */
+ unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
} blkif_request_t;
+#define blkif_first_sect(_fas) (((_fas)>>3)&7)
+#define blkif_last_sect(_fas) ((_fas)&7)
+
typedef struct {
unsigned long id; /* copied from request */
u8 operation; /* copied from request */
@@ -79,8 +82,8 @@ typedef struct {
* @device == unused (zero)
* @id == any value (echoed in response message)
* @sector_num == unused (zero)
- * @buffer_and_sects == list of page-aligned, page-sized buffers.
- * (i.e., nr_sects == 8).
+ * @frame_and_sects == list of page-sized buffers.
+ * (i.e., @first_sect == 0, @last_sect == 7).
*
* The response is a list of vdisk_t elements copied into the out-of-band
* probe buffer. On success the response status field contains the number
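
The new frame_and_sects encoding above packs the machine frame number and the first/last sector indices of a segment into a single word (last_sect in bits 2:0, first_sect in bits 5:3, frame from bit 12 upwards). A minimal worked example, in Python for illustration only; PAGE_SHIFT=12 and the frame value are placeholders, and the helpers simply mirror the blkif_first_sect/blkif_last_sect macros:

    PAGE_SHIFT = 12

    def blkif_first_sect(fas):
        return (fas >> 3) & 7

    def blkif_last_sect(fas):
        return fas & 7

    # Encode a whole-page buffer (sectors 0..7) at machine frame 0x12345,
    # as the frontend does with: buffer_ma | (fsect << 3) | lsect
    frame = 0x12345
    fas = (frame << PAGE_SHIFT) | (0 << 3) | 7
    assert blkif_first_sect(fas) == 0 and blkif_last_sect(fas) == 7
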
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c
index 29cc01d087..63f1aeea26 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/main.c
@@ -24,8 +24,6 @@ typedef unsigned char byte; /* from linux/ide.h */
static unsigned int blkif_state = BLKIF_STATE_CLOSED;
static unsigned int blkif_evtchn, blkif_irq;
-static struct tq_struct blkif_statechange_tq;
-
static int blkif_control_rsp_valid;
static blkif_response_t blkif_control_rsp;
@@ -302,11 +300,18 @@ static int blkif_queue_request(unsigned long id,
struct gendisk *gd;
blkif_request_t *req;
struct buffer_head *bh;
+ unsigned int fsect, lsect;
- if ( unlikely(nr_sectors >= (1<<9)) )
- BUG();
+ fsect = (buffer_ma & ~PAGE_MASK) >> 9;
+ lsect = fsect + nr_sectors - 1;
+
+ /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */
if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) )
BUG();
+ if ( lsect > 7 )
+ BUG();
+
+ buffer_ma &= PAGE_MASK;
if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) )
return 1;
@@ -341,8 +346,9 @@ static int blkif_queue_request(unsigned long id,
bh = (struct buffer_head *)id;
bh->b_reqnext = (struct buffer_head *)req->id;
req->id = id;
- req->buffer_and_sects[req->nr_segments] = buffer_ma | nr_sectors;
- if ( ++req->nr_segments < MAX_BLK_SEGS )
+ req->frame_and_sects[req->nr_segments] =
+ buffer_ma | (fsect<<3) | lsect;
+ if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST )
sg_next_sect += nr_sectors;
else
DISABLE_SCATTERGATHER();
@@ -371,7 +377,7 @@ static int blkif_queue_request(unsigned long id,
req->sector_number = (blkif_sector_t)sector_number;
req->device = device;
req->nr_segments = 1;
- req->buffer_and_sects[0] = buffer_ma | nr_sectors;
+ req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect;
req_prod++;
return 0;
@@ -556,46 +562,11 @@ void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp)
}
-static void blkif_bringup_phase1(void *unused)
+static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
{
ctrl_msg_t cmsg;
blkif_fe_interface_connect_t up;
- /* Move from CLOSED to DISCONNECTED state. */
- blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
- blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
- blkif_state = BLKIF_STATE_DISCONNECTED;
-
- /* Construct an interface-CONNECT message for the domain controller. */
- cmsg.type = CMSG_BLKIF_FE;
- cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT;
- cmsg.length = sizeof(blkif_fe_interface_connect_t);
- up.handle = 0;
- up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
- memcpy(cmsg.msg, &up, sizeof(up));
-
- /* Tell the controller to bring up the interface. */
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void blkif_bringup_phase2(void *unused)
-{
- blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
- (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
-
- /* Probe for discs that are attached to the interface. */
- xlvbd_init();
-
- blkif_state = BLKIF_STATE_CONNECTED;
-
- /* Kick pending requests. */
- spin_lock_irq(&io_request_lock);
- kick_pending_request_queues();
- spin_unlock_irq(&io_request_lock);
-}
-
-static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
-{
if ( status->handle != 0 )
{
printk(KERN_WARNING "Status change on unsupported blkif %d\n",
@@ -617,8 +588,22 @@ static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
" in state %d\n", blkif_state);
break;
}
- blkif_statechange_tq.routine = blkif_bringup_phase1;
- schedule_task(&blkif_statechange_tq);
+
+ /* Move from CLOSED to DISCONNECTED state. */
+ blk_ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+ blk_ring->req_prod = blk_ring->resp_prod = resp_cons = req_prod = 0;
+ blkif_state = BLKIF_STATE_DISCONNECTED;
+
+ /* Construct an interface-CONNECT message for the domain controller. */
+ cmsg.type = CMSG_BLKIF_FE;
+ cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT;
+ cmsg.length = sizeof(blkif_fe_interface_connect_t);
+ up.handle = 0;
+ up.shmem_frame = virt_to_machine(blk_ring) >> PAGE_SHIFT;
+ memcpy(cmsg.msg, &up, sizeof(up));
+
+ /* Tell the controller to bring up the interface. */
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
break;
case BLKIF_INTERFACE_STATUS_CONNECTED:
@@ -628,9 +613,20 @@ static void blkif_status_change(blkif_fe_interface_status_changed_t *status)
" in state %d\n", blkif_state);
break;
}
+
blkif_evtchn = status->evtchn;
- blkif_statechange_tq.routine = blkif_bringup_phase2;
- schedule_task(&blkif_statechange_tq);
+ blkif_irq = bind_evtchn_to_irq(blkif_evtchn);
+ (void)request_irq(blkif_irq, blkif_int, 0, "blkif", NULL);
+
+ /* Probe for discs that are attached to the interface. */
+ xlvbd_init();
+
+ blkif_state = BLKIF_STATE_CONNECTED;
+
+ /* Kick pending requests. */
+ spin_lock_irq(&io_request_lock);
+ kick_pending_request_queues();
+ spin_unlock_irq(&io_request_lock);
break;
default:
@@ -675,7 +671,11 @@ int __init xlblk_init(void)
ctrl_msg_t cmsg;
blkif_fe_driver_status_changed_t st;
- (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx);
+ if ( start_info.flags & SIF_INITDOMAIN )
+ return 0;
+
+ (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
/* Send a driver-UP notification to the domain controller. */
cmsg.type = CMSG_BLKIF_FE;
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c
index b26907192a..12ce976cb5 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/blkif/frontend/vbd.c
@@ -67,7 +67,7 @@ static int xlvbd_get_vbd_info(vdisk_t *disk_info)
memset(&req, 0, sizeof(req));
req.operation = BLKIF_OP_PROBE;
req.nr_segments = 1;
- req.buffer_and_sects[0] = virt_to_machine(buf) | (PAGE_SIZE/512);
+ req.frame_and_sects[0] = virt_to_machine(buf) | 7;
blkif_control_send(&req, &rsp);
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c
index e01896385b..244f309467 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/console/console.c
@@ -513,7 +513,7 @@ static int __init xencons_init(void)
}
else
{
- (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx);
+ (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0);
}
printk("Xen virtual console successfully installed\n");
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c
index e0e43ff2cc..cf1b075031 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/control.c
@@ -10,8 +10,6 @@
static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
{
- DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype);
-
switch ( msg->subtype )
{
case CMSG_NETIF_BE_CREATE:
@@ -54,7 +52,8 @@ void netif_ctrlif_init(void)
ctrl_msg_t cmsg;
netif_be_driver_status_changed_t st;
- (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx);
+ (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
/* Send a driver-UP notification to the domain controller. */
cmsg.type = CMSG_NETIF_BE;
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c
index 8623d8214b..b6a9cff692 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/interface.c
@@ -7,6 +7,7 @@
*/
#include "common.h"
+#include <linux/rtnetlink.h>
#define NETIF_HASHSZ 1024
#define NETIF_HASH(_d,_h) \
@@ -14,6 +15,7 @@
static netif_t *netif_hash[NETIF_HASHSZ];
static struct net_device *bridge_dev;
+static struct net_bridge *bridge_br;
netif_t *netif_find_by_handle(domid_t domid, unsigned int handle)
{
@@ -36,8 +38,10 @@ void __netif_disconnect_complete(netif_t *netif)
*/
unbind_evtchn_from_irq(netif->evtchn);
vfree(netif->tx); /* Frees netif->rx as well. */
- (void)br_del_if((struct net_bridge *)bridge_dev->priv, netif->dev);
+ rtnl_lock();
+ (void)br_del_if(bridge_br, netif->dev);
(void)dev_close(netif->dev);
+ rtnl_unlock();
/* Construct the deferred response message. */
cmsg.type = CMSG_NETIF_BE;
@@ -73,7 +77,7 @@ void netif_create(netif_be_create_t *create)
struct net_device *dev;
netif_t **pnetif, *netif;
- dev = alloc_netdev(sizeof(netif_t), "netif-be-%d", ether_setup);
+ dev = alloc_netdev(sizeof(netif_t), "nbe-if%d", ether_setup);
if ( dev == NULL )
{
DPRINTK("Could not create netif: out of memory\n");
@@ -111,7 +115,10 @@ void netif_create(netif_be_create_t *create)
dev->hard_start_xmit = netif_be_start_xmit;
dev->get_stats = netif_be_get_stats;
memcpy(dev->dev_addr, create->mac, ETH_ALEN);
-
+
+ /* XXX In bridge mode we should force a different MAC from remote end. */
+ dev->dev_addr[2] ^= 1;
+
if ( register_netdev(dev) != 0 )
{
DPRINTK("Could not register new net device\n");
@@ -225,15 +232,27 @@ void netif_connect(netif_be_connect_t *connect)
netif->status = CONNECTED;
netif_get(netif);
+ rtnl_lock();
+
(void)dev_open(netif->dev);
- (void)br_add_if((struct net_bridge *)bridge_dev->priv, netif->dev);
- /* At this point we try to ensure that eth0 is attached to the bridge. */
+ (void)br_add_if(bridge_br, netif->dev);
+
+ /*
+ * The default config is a very simple binding to eth0.
+ * If eth0 is being used as an IP interface by this OS then someone
+ * must add eth0's IP address to nbe-br, and change the routing table
+ * to refer to nbe-br instead of eth0.
+ */
+ (void)dev_open(bridge_dev);
if ( (eth0_dev = __dev_get_by_name("eth0")) != NULL )
{
(void)dev_open(eth0_dev);
- (void)br_add_if((struct net_bridge *)bridge_dev->priv, eth0_dev);
+ (void)br_add_if(bridge_br, eth0_dev);
}
- (void)request_irq(netif->irq, netif_be_int, 0, "netif-backend", netif);
+
+ rtnl_unlock();
+
+ (void)request_irq(netif->irq, netif_be_int, 0, netif->dev->name, netif);
netif_start_queue(netif->dev);
connect->status = NETIF_BE_STATUS_OKAY;
@@ -271,8 +290,11 @@ int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id)
void netif_interface_init(void)
{
memset(netif_hash, 0, sizeof(netif_hash));
- if ( br_add_bridge("netif-backend") != 0 )
+ if ( br_add_bridge("nbe-br") != 0 )
BUG();
- bridge_dev = __dev_get_by_name("netif-be-bridge");
- (void)dev_open(bridge_dev);
+ bridge_dev = __dev_get_by_name("nbe-br");
+ bridge_br = (struct net_bridge *)bridge_dev->priv;
+ bridge_br->bridge_hello_time = bridge_br->hello_time = 0;
+ bridge_br->bridge_forward_delay = bridge_br->forward_delay = 0;
+ bridge_br->stp_enabled = 0;
}
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c
index 5b84eba9bc..62a4adf27d 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/backend/main.c
@@ -14,7 +14,7 @@
#include <asm/hypervisor-ifs/dom_mem_ops.h>
static void net_tx_action(unsigned long unused);
-static void tx_skb_release(struct sk_buff *skb);
+static void netif_page_release(struct page *page);
static void make_tx_response(netif_t *netif,
u16 id,
s8 st);
@@ -30,13 +30,13 @@ static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0);
#define tx_work_exists(_if) (1)
#define MAX_PENDING_REQS 256
-unsigned long mmap_vstart;
+static unsigned long mmap_vstart;
#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
#define PKT_PROT_LEN (ETH_HLEN + 20)
-/*static pending_req_t pending_reqs[MAX_PENDING_REQS];*/
static u16 pending_id[MAX_PENDING_REQS];
+static netif_t *pending_netif[MAX_PENDING_REQS];
static u16 pending_ring[MAX_PENDING_REQS];
static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
typedef unsigned int PEND_RING_IDX;
@@ -60,8 +60,7 @@ static void __refresh_mfn_list(void)
op.u.increase.pages = mfn_list;
if ( (ret = HYPERVISOR_dom_mem_op(&op)) != MAX_MFN_ALLOC )
{
- printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
- ret);
+ printk(KERN_ALERT "Unable to increase memory reservation (%d)\n", ret);
BUG();
}
alloc_index = MAX_MFN_ALLOC;
@@ -100,10 +99,10 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
netif_t *netif = (netif_t *)dev->priv;
s8 status = NETIF_RSP_OKAY;
- u16 size, id;
+ u16 size=0, id;
mmu_update_t mmu[6];
pgd_t *pgd; pmd_t *pmd; pte_t *pte;
- unsigned long vdata, new_mfn;
+ unsigned long vdata, mdata=0, new_mfn;
/* Drop the packet if the target domain has no receive buffers. */
if ( (netif->rx_req_cons == netif->rx->req_prod) ||
@@ -126,16 +125,23 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
(((unsigned long)skb->end ^ (unsigned long)skb->head) & PAGE_MASK) ||
((skb->end - skb->head) < (PAGE_SIZE/2)) )
{
- struct sk_buff *nskb = dev_alloc_skb(PAGE_SIZE-1024);
+ struct sk_buff *nskb = alloc_skb(PAGE_SIZE-1024, GFP_ATOMIC);
int hlen = skb->data - skb->head;
+ if ( unlikely(nskb == NULL) )
+ {
+ DPRINTK("DOM%llu couldn't get memory for skb.\n", netif->domid);
+ status = NETIF_RSP_ERROR;
+ goto out;
+ }
skb_reserve(nskb, hlen);
- skb_put(nskb, skb->len);
+ __skb_put(nskb, skb->len);
(void)skb_copy_bits(skb, -hlen, nskb->head, hlen + skb->len);
dev_kfree_skb(skb);
skb = nskb;
}
vdata = (unsigned long)skb->data;
+ mdata = virt_to_machine(vdata);
size = skb->tail - skb->data;
new_mfn = get_new_mfn();
@@ -153,7 +159,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
mmu[1].ptr |= MMU_EXTENDED_COMMAND;
mmu[1].val |= MMUEXT_SET_SUBJECTDOM_H;
- mmu[2].ptr = virt_to_machine(vdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
+ mmu[2].ptr = (mdata & PAGE_MASK) | MMU_EXTENDED_COMMAND;
mmu[2].val = MMUEXT_REASSIGN_PAGE;
mmu[3].ptr = MMU_EXTENDED_COMMAND;
@@ -167,6 +173,7 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
if ( unlikely(HYPERVISOR_mmu_update(mmu, 6) < 0) )
{
+ DPRINTK("Failed MMU update transferring to DOM%llu\n", netif->domid);
dealloc_mfn(new_mfn);
status = NETIF_RSP_ERROR;
goto out;
@@ -174,12 +181,12 @@ int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
phys_to_machine_mapping[__pa(vdata) >> PAGE_SHIFT] = new_mfn;
- netif->stats.tx_bytes += size;
- netif->stats.tx_packets++;
+ netif->stats.rx_bytes += size;
+ netif->stats.rx_packets++;
out:
spin_lock(&netif->rx_lock);
- make_rx_response(netif, id, status, virt_to_machine(vdata), size);
+ make_rx_response(netif, id, status, mdata, size);
spin_unlock(&netif->rx_lock);
dev_kfree_skb(skb);
return 0;
@@ -220,6 +227,16 @@ static void add_to_net_schedule_list_tail(netif_t *netif)
spin_unlock(&net_schedule_list_lock);
}
+static inline void netif_schedule_work(netif_t *netif)
+{
+ if ( (netif->tx_req_cons != netif->tx->req_prod) &&
+ ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) )
+ {
+ add_to_net_schedule_list_tail(netif);
+ maybe_schedule_tx_action();
+ }
+}
+
void netif_deschedule(netif_t *netif)
{
remove_from_net_schedule_list(netif);
@@ -229,14 +246,8 @@ void netif_deschedule(netif_t *netif)
static void tx_credit_callback(unsigned long data)
{
netif_t *netif = (netif_t *)data;
-
netif->remaining_credit = netif->credit_bytes;
-
- if ( tx_work_exists(netif) )
- {
- add_to_net_schedule_list_tail(netif);
- maybe_schedule_tx_action();
- }
+ netif_schedule_work(netif);
}
#endif
@@ -249,6 +260,7 @@ static void net_tx_action(unsigned long unused)
u16 pending_idx;
NETIF_RING_IDX i;
pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED);
+ struct page *page;
while ( (NR_PENDING_REQS < MAX_PENDING_REQS) &&
!list_empty(&net_schedule_list) )
@@ -261,7 +273,7 @@ static void net_tx_action(unsigned long unused)
/* Work to do? */
i = netif->tx_req_cons;
- if ( (i == netif->tx->req_prod) &&
+ if ( (i == netif->tx->req_prod) ||
((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) )
{
netif_put(netif);
@@ -296,7 +308,7 @@ static void net_tx_action(unsigned long unused)
netif->remaining_credit -= tx.size;
#endif
- add_to_net_schedule_list_tail(netif);
+ netif_schedule_work(netif);
if ( unlikely(txreq.size <= PKT_PROT_LEN) ||
unlikely(txreq.size > ETH_FRAME_LEN) )
@@ -335,6 +347,7 @@ static void net_tx_action(unsigned long unused)
if ( unlikely((skb = alloc_skb(PKT_PROT_LEN, GFP_ATOMIC)) == NULL) )
{
+ DPRINTK("Can't allocate a skb in start_xmit.\n");
make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
netif_put(netif);
vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
@@ -346,29 +359,29 @@ static void net_tx_action(unsigned long unused)
(void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)),
PKT_PROT_LEN);
- skb->dev = netif->dev;
- skb->protocol = eth_type_trans(skb, skb->dev);
-
+ page = virt_to_page(MMAP_VADDR(pending_idx));
+
/* Append the packet payload as a fragment. */
- skb_shinfo(skb)->frags[0].page =
- virt_to_page(MMAP_VADDR(pending_idx));
- skb_shinfo(skb)->frags[0].size =
- txreq.size - PKT_PROT_LEN;
+ skb_shinfo(skb)->frags[0].page = page;
+ skb_shinfo(skb)->frags[0].size = txreq.size - PKT_PROT_LEN;
skb_shinfo(skb)->frags[0].page_offset =
(txreq.addr + PKT_PROT_LEN) & ~PAGE_MASK;
skb_shinfo(skb)->nr_frags = 1;
skb->data_len = txreq.size - PKT_PROT_LEN;
skb->len += skb->data_len;
+ skb->dev = netif->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
/* Destructor information. */
- skb->destructor = tx_skb_release;
- skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page = (struct page *)netif;
- skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size = pending_idx;
+ atomic_set(&page->count, 1);
+ page->mapping = (struct address_space *)netif_page_release;
+ pending_id[pending_idx] = txreq.id;
+ pending_netif[pending_idx] = netif;
- netif->stats.rx_bytes += txreq.size;
- netif->stats.rx_packets++;
+ netif->stats.tx_bytes += txreq.size;
+ netif->stats.tx_packets++;
- pending_id[pending_idx] = txreq.id;
pending_cons++;
netif_rx(skb);
@@ -376,28 +389,34 @@ static void net_tx_action(unsigned long unused)
}
}
-/* Destructor function for tx skbs. */
-static void tx_skb_release(struct sk_buff *skb)
+static void netif_page_release(struct page *page)
{
unsigned long flags;
- netif_t *netif = (netif_t *)skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].page;
- u16 pending_idx = skb_shinfo(skb)->frags[MAX_SKB_FRAGS-1].size;
+ netif_t *netif;
+ u16 pending_idx;
+
+ pending_idx = page - virt_to_page(mmap_vstart);
+
+ netif = pending_netif[pending_idx];
vmfree_area_pages(MMAP_VADDR(pending_idx), PAGE_SIZE);
-
- skb_shinfo(skb)->nr_frags = 0;
-
+
spin_lock(&netif->tx_lock);
make_tx_response(netif, pending_id[pending_idx], NETIF_RSP_OKAY);
spin_unlock(&netif->tx_lock);
-
+
+ /*
+ * Scheduling checks must happen after the above response is posted.
+ * This avoids a possible race with a guest OS on another CPU.
+ */
+ mb();
+ netif_schedule_work(netif);
+
netif_put(netif);
spin_lock_irqsave(&pend_prod_lock, flags);
pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
spin_unlock_irqrestore(&pend_prod_lock, flags);
-
- maybe_schedule_tx_action();
}
#if 0
@@ -493,9 +512,26 @@ static void make_rx_response(netif_t *netif,
static int __init init_module(void)
{
+ int i;
+
+ if ( !(start_info.flags & SIF_INITDOMAIN) )
+ return 0;
+
netif_interface_init();
- mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS);
+
+ if ( (mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS)) == 0 )
+ BUG();
+
+ pending_cons = 0;
+ pending_prod = MAX_PENDING_REQS;
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ pending_ring[i] = i;
+
+ spin_lock_init(&net_schedule_list_lock);
+ INIT_LIST_HEAD(&net_schedule_list);
+
netif_ctrlif_init();
+
return 0;
}
diff --git a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c
index af8e660b7c..cc5ac31e82 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/drivers/netif/frontend/main.c
@@ -25,20 +25,18 @@
#include <net/sock.h>
#include <net/pkt_sched.h>
-#include "../netif.h"
+#include <asm/evtchn.h>
+#include <asm/ctrl_if.h>
+#include <asm/hypervisor-ifs/dom_mem_ops.h>
-static struct tq_struct netif_statechange_tq;
+#include "../netif.h"
#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */
-static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs);
static void network_tx_buf_gc(struct net_device *dev);
static void network_alloc_rx_buffers(struct net_device *dev);
static void cleanup_module(void);
-/* Dynamically-mapped IRQs. */
-static int network_irq, debug_irq;
-
static struct list_head dev_list;
struct net_private
@@ -47,7 +45,7 @@ struct net_private
struct net_device *dev;
struct net_device_stats stats;
- NET_RING_IDX rx_resp_cons, tx_resp_cons;
+ NETIF_RING_IDX rx_resp_cons, tx_resp_cons;
unsigned int tx_full;
netif_tx_interface_t *tx;
@@ -69,8 +67,8 @@ struct net_private
* {tx,rx}_skbs store outstanding skbuffs. The first entry in each
* array is an index into a chain of free entries.
*/
- struct sk_buff *tx_skbs[XENNET_TX_RING_SIZE+1];
- struct sk_buff *rx_skbs[XENNET_RX_RING_SIZE+1];
+ struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1];
+ struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1];
};
/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. */
@@ -91,7 +89,7 @@ static struct net_device *find_dev_by_handle(unsigned int handle)
{
np = list_entry(ent, struct net_private, list);
if ( np->handle == handle )
- return np;
+ return np->dev;
}
return NULL;
}
@@ -100,8 +98,7 @@ static struct net_device *find_dev_by_handle(unsigned int handle)
static int network_open(struct net_device *dev)
{
struct net_private *np = dev->priv;
- netop_t netop;
- int i, ret;
+ int i;
if ( np->state != NETIF_STATE_CONNECTED )
return -EINVAL;
@@ -111,15 +108,16 @@ static int network_open(struct net_device *dev)
spin_lock_init(&np->tx_lock);
/* Initialise {tx,rx}_skbs to be a free chain containing every entry. */
- for ( i = 0; i <= XENNET_TX_RING_SIZE; i++ )
+ for ( i = 0; i <= NETIF_TX_RING_SIZE; i++ )
np->tx_skbs[i] = (void *)(i+1);
- for ( i = 0; i <= XENNET_RX_RING_SIZE; i++ )
+ for ( i = 0; i <= NETIF_RX_RING_SIZE; i++ )
np->rx_skbs[i] = (void *)(i+1);
wmb();
np->state = NETIF_STATE_ACTIVE;
network_alloc_rx_buffers(dev);
+ np->rx->event = np->rx_resp_cons + 1;
netif_start_queue(dev);
@@ -131,18 +129,17 @@ static int network_open(struct net_device *dev)
static void network_tx_buf_gc(struct net_device *dev)
{
- NET_RING_IDX i, prod;
+ NETIF_RING_IDX i, prod;
unsigned short id;
struct net_private *np = dev->priv;
struct sk_buff *skb;
- tx_entry_t *tx_ring = np->net_ring->tx_ring;
do {
- prod = np->net_idx->tx_resp_prod;
+ prod = np->tx->resp_prod;
for ( i = np->tx_resp_cons; i != prod; i++ )
{
- id = tx_ring[MASK_NET_TX_IDX(i)].resp.id;
+ id = np->tx->ring[MASK_NET_TX_IDX(i)].resp.id;
skb = np->tx_skbs[id];
ADD_ID_TO_FREELIST(np->tx_skbs, id);
dev_kfree_skb_any(skb);
@@ -158,14 +155,14 @@ static void network_tx_buf_gc(struct net_device *dev)
* in such cases notification from Xen is likely to be the only kick
* that we'll get.
*/
- np->net_idx->tx_event =
- prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1;
+ np->tx->event =
+ prod + ((np->tx->req_prod - prod) >> 1) + 1;
mb();
}
- while ( prod != np->net_idx->tx_resp_prod );
+ while ( prod != np->tx->resp_prod );
if ( np->tx_full &&
- ((np->net_idx->tx_req_prod - prod) < XENNET_TX_RING_SIZE) )
+ ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE) )
{
np->tx_full = 0;
if ( np->state == NETIF_STATE_ACTIVE )
@@ -189,10 +186,14 @@ static void network_alloc_rx_buffers(struct net_device *dev)
unsigned short id;
struct net_private *np = dev->priv;
struct sk_buff *skb;
- netop_t netop;
- NET_RING_IDX i = np->net_idx->rx_req_prod;
-
- if ( unlikely((i - np->rx_resp_cons) == XENNET_RX_RING_SIZE) ||
+ NETIF_RING_IDX i = np->rx->req_prod;
+ dom_mem_op_t op;
+ unsigned long pfn_array[NETIF_RX_RING_SIZE];
+ int ret, nr_pfns = 0;
+ pte_t *pte;
+
+ /* Make sure the batch is large enough to be worthwhile (1/2 ring). */
+ if ( unlikely((i - np->rx_resp_cons) > (NETIF_RX_RING_SIZE/2)) ||
unlikely(np->state != NETIF_STATE_ACTIVE) )
return;
@@ -209,13 +210,13 @@ static void network_alloc_rx_buffers(struct net_device *dev)
id = GET_ID_FROM_FREELIST(np->rx_skbs);
np->rx_skbs[id] = skb;
- np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id;
- np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr =
- virt_to_machine(get_ppte(skb->head));
-
- np->rx_bufs_to_notify++;
+ np->rx->ring[MASK_NET_RX_IDX(i)].req.id = id;
+
+ pte = get_ppte(skb->head);
+ pfn_array[nr_pfns++] = pte->pte_low >> PAGE_SHIFT;
+ queue_l1_entry_update(pte, 0);
}
- while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE );
+ while ( (++i - np->rx_resp_cons) != NETIF_RX_RING_SIZE );
/*
* We may have allocated buffers which have entries outstanding in the page
@@ -223,17 +224,16 @@ static void network_alloc_rx_buffers(struct net_device *dev)
*/
flush_page_update_queue();
- np->net_idx->rx_req_prod = i;
- np->net_idx->rx_event = np->rx_resp_cons + 1;
-
- /* Batch Xen notifications. */
- if ( np->rx_bufs_to_notify > (XENNET_RX_RING_SIZE/4) )
+ op.op = MEMOP_RESERVATION_DECREASE;
+ op.u.decrease.size = nr_pfns;
+ op.u.decrease.pages = pfn_array;
+ if ( (ret = HYPERVISOR_dom_mem_op(&op)) != nr_pfns )
{
- netop.cmd = NETOP_PUSH_BUFFERS;
- netop.vif = np->idx;
- (void)HYPERVISOR_net_io_op(&netop);
- np->rx_bufs_to_notify = 0;
+ printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
+ BUG();
}
+
+ np->rx->req_prod = i;
}
@@ -241,9 +241,8 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
unsigned short id;
struct net_private *np = (struct net_private *)dev->priv;
- tx_req_entry_t *tx;
- netop_t netop;
- NET_RING_IDX i;
+ netif_tx_request_t *tx;
+ NETIF_RING_IDX i;
if ( unlikely(np->tx_full) )
{
@@ -262,27 +261,27 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
memcpy(new_skb->data, skb->data, skb->len);
dev_kfree_skb(skb);
skb = new_skb;
- }
+ }
spin_lock_irq(&np->tx_lock);
- i = np->net_idx->tx_req_prod;
+ i = np->tx->req_prod;
id = GET_ID_FROM_FREELIST(np->tx_skbs);
np->tx_skbs[id] = skb;
- tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req;
+ tx = &np->tx->ring[MASK_NET_TX_IDX(i)].req;
tx->id = id;
- tx->addr = phys_to_machine(virt_to_phys(skb->data));
+ tx->addr = virt_to_machine(skb->data);
tx->size = skb->len;
wmb();
- np->net_idx->tx_req_prod = i + 1;
+ np->tx->req_prod = i + 1;
network_tx_buf_gc(dev);
- if ( (i - np->tx_resp_cons) == (XENNET_TX_RING_SIZE - 1) )
+ if ( (i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1) )
{
np->tx_full = 1;
netif_stop_queue(dev);
@@ -295,12 +294,8 @@ static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
/* Only notify Xen if there are no outstanding responses. */
mb();
- if ( np->net_idx->tx_resp_prod == i )
- {
- netop.cmd = NETOP_PUSH_BUFFERS;
- netop.vif = np->idx;
- (void)HYPERVISOR_net_io_op(&netop);
- }
+ if ( np->tx->resp_prod == i )
+ notify_via_evtchn(np->evtchn);
return 0;
}
@@ -312,22 +307,24 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
struct net_private *np = dev->priv;
unsigned long flags;
struct sk_buff *skb;
- rx_resp_entry_t *rx;
- NET_RING_IDX i;
+ netif_rx_response_t *rx;
+ NETIF_RING_IDX i;
+ mmu_update_t mmu[2];
+ pte_t *pte;
spin_lock_irqsave(&np->tx_lock, flags);
network_tx_buf_gc(dev);
spin_unlock_irqrestore(&np->tx_lock, flags);
again:
- for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ )
+ for ( i = np->rx_resp_cons; i != np->rx->resp_prod; i++ )
{
- rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp;
+ rx = &np->rx->ring[MASK_NET_RX_IDX(i)].resp;
skb = np->rx_skbs[rx->id];
ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
- if ( unlikely(rx->status != RING_STATUS_OK) )
+ if ( unlikely(rx->status <= 0) )
{
/* Gate this error. We get a (valid) slew of them on suspend. */
if ( np->state == NETIF_STATE_ACTIVE )
@@ -336,6 +333,17 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
continue;
}
+ /* Remap the page. */
+ pte = get_ppte(skb->head);
+ mmu[0].ptr = virt_to_machine(pte);
+ mmu[0].val = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
+ mmu[1].ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE;
+ mmu[1].val = __pa(skb->head) >> PAGE_SHIFT;
+ if ( HYPERVISOR_mmu_update(mmu, 2) != 0 )
+ BUG();
+ phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] =
+ rx->addr >> PAGE_SHIFT;
+
/*
     * Set up shinfo -- from alloc_skb. This was particularly nasty: the
* shared info is hidden at the back of the data area (presumably so it
@@ -348,13 +356,13 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
(*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
- skb->data = skb->tail = skb->head + rx->offset;
- skb_put(skb, rx->size);
+ skb->data = skb->tail = skb->head + (rx->addr & ~PAGE_MASK);
+ skb_put(skb, rx->status);
skb->protocol = eth_type_trans(skb, dev);
np->stats.rx_packets++;
- np->stats.rx_bytes += rx->size;
+ np->stats.rx_bytes += rx->status;
netif_rx(skb);
dev->last_rx = jiffies;
}
@@ -362,10 +370,11 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
np->rx_resp_cons = i;
network_alloc_rx_buffers(dev);
+ np->rx->event = np->rx_resp_cons + 1;
/* Deal with hypervisor racing our resetting of rx_event. */
mb();
- if ( np->net_idx->rx_resp_prod != i )
+ if ( np->rx->resp_prod != i )
goto again;
}
@@ -373,16 +382,11 @@ static void netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
static int network_close(struct net_device *dev)
{
struct net_private *np = dev->priv;
- netop_t netop;
netif_stop_queue(np->dev);
- netop.cmd = NETOP_FLUSH_BUFFERS;
- netop.vif = np->idx;
- (void)HYPERVISOR_net_io_op(&netop);
-
- while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) ||
- (np->tx_resp_cons != np->net_idx->tx_req_prod) )
+ while ( (np->rx_resp_cons != np->rx->req_prod) ||
+ (np->tx_resp_cons != np->tx->req_prod) )
{
barrier();
current->state = TASK_INTERRUPTIBLE;
@@ -406,55 +410,12 @@ static struct net_device_stats *network_get_stats(struct net_device *dev)
}
-static void netif_bringup_phase1(void *unused)
+static void netif_status_change(netif_fe_interface_status_changed_t *status)
{
ctrl_msg_t cmsg;
netif_fe_interface_connect_t up;
struct net_device *dev;
struct net_private *np;
-
- dev = find_dev_by_handle(0);
- np = dev->priv;
-
- /* Move from CLOSED to DISCONNECTED state. */
- np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
- np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
- memset(np->tx, 0, PAGE_SIZE);
- memset(np->rx, 0, PAGE_SIZE);
- np->state = NETIF_STATE_DISCONNECTED;
-
- /* Construct an interface-CONNECT message for the domain controller. */
- cmsg.type = CMSG_NETIF_FE;
- cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT;
- cmsg.length = sizeof(netif_fe_interface_connect_t);
- up.handle = 0;
- up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
- up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
- memcpy(cmsg.msg, &up, sizeof(up));
-
- /* Tell the controller to bring up the interface. */
- ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void netif_bringup_phase2(void *unused)
-{
- struct net_device *dev;
- struct net_private *np;
-
- dev = find_dev_by_handle(0);
- np = dev->priv;
-
- np->irq = bind_evtchn_to_irq(np->evtchn);
- (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM,
- "netif", dev);
-
- np->state = NETIF_STATE_CONNECTED;
-}
-
-static void netif_status_change(netif_fe_interface_status_changed_t *status)
-{
- struct net_device *dev;
- struct net_private *np;
if ( status->handle != 0 )
{
@@ -470,31 +431,53 @@ static void netif_status_change(netif_fe_interface_status_changed_t *status)
{
case NETIF_INTERFACE_STATUS_DESTROYED:
printk(KERN_WARNING "Unexpected netif-DESTROYED message in state %d\n",
- netif_state);
+ np->state);
break;
case NETIF_INTERFACE_STATUS_DISCONNECTED:
if ( np->state != NETIF_STATE_CLOSED )
{
printk(KERN_WARNING "Unexpected netif-DISCONNECTED message"
- " in state %d\n", netif_state);
+ " in state %d\n", np->state);
break;
}
- netif_statechange_tq.routine = netif_bringup_phase1;
- schedule_task(&netif_statechange_tq);
+
+ /* Move from CLOSED to DISCONNECTED state. */
+ np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL);
+ np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL);
+ memset(np->tx, 0, PAGE_SIZE);
+ memset(np->rx, 0, PAGE_SIZE);
+ np->state = NETIF_STATE_DISCONNECTED;
+
+ /* Construct an interface-CONNECT message for the domain controller. */
+ cmsg.type = CMSG_NETIF_FE;
+ cmsg.subtype = CMSG_NETIF_FE_INTERFACE_CONNECT;
+ cmsg.length = sizeof(netif_fe_interface_connect_t);
+ up.handle = 0;
+ up.tx_shmem_frame = virt_to_machine(np->tx) >> PAGE_SHIFT;
+ up.rx_shmem_frame = virt_to_machine(np->rx) >> PAGE_SHIFT;
+ memcpy(cmsg.msg, &up, sizeof(up));
+
+ /* Tell the controller to bring up the interface. */
+ ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
break;
case NETIF_INTERFACE_STATUS_CONNECTED:
if ( np->state == NETIF_STATE_CLOSED )
{
printk(KERN_WARNING "Unexpected netif-CONNECTED message"
- " in state %d\n", netif_state);
+ " in state %d\n", np->state);
break;
}
- np->evtchn = status->evtchn;
+
memcpy(dev->dev_addr, status->mac, ETH_ALEN);
- netif_statechange_tq.routine = netif_bringup_phase2;
- schedule_task(&netif_statechange_tq);
+
+ np->evtchn = status->evtchn;
+ np->irq = bind_evtchn_to_irq(np->evtchn);
+ (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM,
+ dev->name, dev);
+
+ np->state = NETIF_STATE_CONNECTED;
break;
default:
@@ -532,10 +515,13 @@ static int __init init_module(void)
{
ctrl_msg_t cmsg;
netif_fe_driver_status_changed_t st;
- int i, err;
+ int err;
struct net_device *dev;
struct net_private *np;
+ if ( start_info.flags & SIF_INITDOMAIN )
+ return 0;
+
INIT_LIST_HEAD(&dev_list);
if ( (dev = alloc_etherdev(sizeof(struct net_private))) == NULL )
@@ -562,7 +548,8 @@ static int __init init_module(void)
np->dev = dev;
list_add(&np->list, &dev_list);
- (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx);
+ (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx,
+ CALLBACK_IN_BLOCKING_CONTEXT);
/* Send a driver-UP notification to the domain controller. */
cmsg.type = CMSG_NETIF_FE;
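The rewritten receive path above relies on free-running NETIF_RING_IDX producer/consumer counters (req_prod, resp_prod, rx_resp_cons, ...) that are only masked when the shared ring is actually indexed, so differences such as (i - np->rx_resp_cons) stay correct even after the 32-bit counters wrap. The stand-alone sketch below illustrates that convention; the ring size and the MASK_NET_RX_IDX() definition are assumptions following the usual power-of-two style, not values taken from the interface headers.

#include <stdio.h>

/* Assumed stand-ins for the real interface definitions. */
typedef unsigned int NETIF_RING_IDX;
#define NETIF_RX_RING_SIZE 256
#define MASK_NET_RX_IDX(i) ((i) & (NETIF_RX_RING_SIZE - 1))

int main(void)
{
    /* Free-running indices close to the 32-bit wrap point. */
    NETIF_RING_IDX rx_resp_cons = 0xfffffff0u;           /* consumer */
    NETIF_RING_IDX req_prod     = rx_resp_cons + 200;    /* producer, wrapped */

    /* Unsigned subtraction still counts the outstanding requests. */
    printf("outstanding requests = %u\n", req_prod - rx_resp_cons);   /* 200 */
    printf("ring slot            = %u\n", MASK_NET_RX_IDX(req_prod));

    /* The refill test used by network_alloc_rx_buffers(). */
    if ((req_prod - rx_resp_cons) > (NETIF_RX_RING_SIZE / 2))
        printf("batch is large enough to be worthwhile\n");
    return 0;
}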
diff --git a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
index 715f707eb0..19cb9a3326 100644
--- a/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
+++ b/xenolinux-2.4.26-sparse/arch/xen/kernel/ctrl_if.c
@@ -33,8 +33,19 @@ static struct irqaction ctrl_if_irq_action;
static CONTROL_RING_IDX ctrl_if_tx_resp_cons;
static CONTROL_RING_IDX ctrl_if_rx_req_cons;
-/* Incoming message requests: primary message type -> message handler. */
+/* Incoming message requests. */
+ /* Primary message type -> message handler. */
static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256];
+ /* Primary message type -> callback in process context? */
+static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)];
+ /* Is it late enough during bootstrap to use schedule_task()? */
+static int safe_to_schedule_task;
+ /* Passed to schedule_task(). */
+static struct tq_struct ctrl_if_rxmsg_deferred_tq;
+ /* Queue up messages to be handled in process context. */
+static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE];
+static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod;
+static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons;
/* Incoming message responses: message identifier -> message handler/id. */
static struct {
@@ -99,22 +110,40 @@ static void __ctrl_if_tx_tasklet(unsigned long data)
}
}
+static void __ctrl_if_rxmsg_deferred(void *unused)
+{
+ ctrl_msg_t *msg;
+
+ while ( ctrl_if_rxmsg_deferred_cons != ctrl_if_rxmsg_deferred_prod )
+ {
+ msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
+ ctrl_if_rxmsg_deferred_cons++)];
+ (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
+ }
+}
+
static void __ctrl_if_rx_tasklet(unsigned long data)
{
control_if_t *ctrl_if = get_ctrl_if();
- ctrl_msg_t *msg;
+ ctrl_msg_t msg, *pmsg;
while ( ctrl_if_rx_req_cons != ctrl_if->rx_req_prod )
{
- /*
- * We need no locking or barriers here. There will be one and only one
- * response as a result of each callback, so the callback handler
- * doesn't need to worry about the 'msg' being overwritten until:
- * 1. It returns (if the message must persist then it must be copied).
- * 2. A response is sent (the response may overwrite the request).
- */
- msg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
- (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
+ pmsg = &ctrl_if->rx_ring[MASK_CONTROL_IDX(ctrl_if_rx_req_cons++)];
+ memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
+ if ( msg.length != 0 )
+ memcpy(msg.msg, pmsg->msg, msg.length);
+ if ( test_bit(msg.type, &ctrl_if_rxmsg_blocking_context) )
+ {
+ pmsg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
+ ctrl_if_rxmsg_deferred_prod++)];
+ memcpy(pmsg, &msg, offsetof(ctrl_msg_t, msg) + msg.length);
+ schedule_task(&ctrl_if_rxmsg_deferred_tq);
+ }
+ else
+ {
+ (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
+ }
}
}
@@ -243,22 +272,36 @@ void ctrl_if_send_response(ctrl_msg_t *msg)
ctrl_if_notify_controller();
}
-int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd)
+int ctrl_if_register_receiver(
+ u8 type,
+ ctrl_msg_handler_t hnd,
+ unsigned int flags)
{
- unsigned long flags;
+ unsigned long _flags;
int inuse;
- spin_lock_irqsave(&ctrl_if_lock, flags);
+ spin_lock_irqsave(&ctrl_if_lock, _flags);
inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler);
if ( inuse )
+ {
printk(KERN_INFO "Receiver %p already established for control "
"messages of type %d.\n", ctrl_if_rxmsg_handler[type], type);
+ }
else
+ {
ctrl_if_rxmsg_handler[type] = hnd;
+ clear_bit(type, &ctrl_if_rxmsg_blocking_context);
+ if ( flags == CALLBACK_IN_BLOCKING_CONTEXT )
+ {
+ set_bit(type, &ctrl_if_rxmsg_blocking_context);
+ if ( !safe_to_schedule_task )
+ BUG();
+ }
+ }
- spin_unlock_irqrestore(&ctrl_if_lock, flags);
+ spin_unlock_irqrestore(&ctrl_if_lock, _flags);
return !inuse;
}
@@ -326,6 +369,7 @@ void __init ctrl_if_init(void)
for ( i = 0; i < 256; i++ )
ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler;
+ ctrl_if_rxmsg_deferred_tq.routine = __ctrl_if_rxmsg_deferred;
spin_lock_init(&ctrl_if_lock);
@@ -333,6 +377,15 @@ void __init ctrl_if_init(void)
}
+/* This is called after it is safe to call schedule_task(). */
+static int __init ctrl_if_late_setup(void)
+{
+ safe_to_schedule_task = 1;
+ return 0;
+}
+__initcall(ctrl_if_late_setup);
+
+
/*
* !! The following are DANGEROUS FUNCTIONS !!
* Use with care [for example, see xencons_force_flush()].
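One detail of the receive tasklet above is worth calling out: each request is copied out of the shared ring before dispatch, header first (up to the payload member) and then only msg.length payload bytes. The stand-alone sketch below shows that offsetof()-based copy; the ctrl_msg_t layout used here is a simplified stand-in, not the real structure.

#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* Simplified stand-in for ctrl_msg_t: a fixed header followed by a payload. */
typedef struct {
    unsigned char  type, subtype;
    unsigned short length;
    char           msg[60];
} ctrl_msg_t;

int main(void)
{
    ctrl_msg_t ring_slot = { .type = 1, .subtype = 2, .length = 5 };
    memcpy(ring_slot.msg, "hello", 5);

    /* Copy the header, then only the bytes the sender declared valid. */
    ctrl_msg_t msg;
    memcpy(&msg, &ring_slot, offsetof(ctrl_msg_t, msg));
    if (msg.length != 0)
        memcpy(msg.msg, ring_slot.msg, msg.length);

    printf("type=%d len=%d payload=%.*s\n",
           msg.type, msg.length, msg.length, msg.msg);
    return 0;
}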
diff --git a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c
index 20a934addd..d219c28403 100644
--- a/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c
+++ b/xenolinux-2.4.26-sparse/drivers/block/ll_rw_blk.c
@@ -1626,7 +1626,7 @@ int __init blk_dev_init(void)
jsfd_init();
#endif
-#ifdef CONFIG_XEN_VBD
+#if defined(CONFIG_XEN_VBD) || defined(CONFIG_XEN_NEWIO)
xlblk_init();
#endif
diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
index a02e2471ea..5bc6cc22b1 100644
--- a/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
+++ b/xenolinux-2.4.26-sparse/include/asm-xen/ctrl_if.h
@@ -80,8 +80,14 @@ void ctrl_if_send_response(ctrl_msg_t *msg);
* Register a receiver for typed messages from the domain controller. The
* handler (@hnd) is called for every received message of specified @type.
* Returns TRUE (non-zero) if the handler was successfully registered.
+ * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will
+ * occur in a context in which it is safe to yield (i.e., process context).
*/
-int ctrl_if_register_receiver(u8 type, ctrl_msg_handler_t hnd);
+#define CALLBACK_IN_BLOCKING_CONTEXT 1
+int ctrl_if_register_receiver(
+ u8 type,
+ ctrl_msg_handler_t hnd,
+ unsigned int flags);
/*
* Unregister a receiver for typed messages from the domain controller. The
diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/io.h b/xenolinux-2.4.26-sparse/include/asm-xen/io.h
index f5243bb6a7..5ab5fe9bfc 100644
--- a/xenolinux-2.4.26-sparse/include/asm-xen/io.h
+++ b/xenolinux-2.4.26-sparse/include/asm-xen/io.h
@@ -159,46 +159,11 @@ extern void iounmap(void *addr);
extern void *bt_ioremap(unsigned long offset, unsigned long size);
extern void bt_iounmap(void *addr, unsigned long size);
-#ifdef CONFIG_XEN_PHYSDEV_ACCESS
-
-#ifdef CONFIG_HIGHMEM
-#error "Highmem is not yet compatible with physical device access"
-#endif
-
-/*
- * The bus translation macros need special care if we are executing device
- * accesses to/from other domains' memory. In these cases the virtual address
- * is actually a temporary mapping in the 'vmalloc' space. The physical
- * address will therefore be >max_low_pfn, and will not have a valid entry
- * in the phys_to_mach mapping table.
- */
-static inline unsigned long phys_to_bus(unsigned long phys)
-{
- extern unsigned long max_pfn;
- pgd_t *pgd; pmd_t *pmd; pte_t *pte;
- void *addr;
- unsigned long bus;
- if ( (phys >> PAGE_SHIFT) < max_pfn )
- return phys_to_machine(phys);
- addr = phys_to_virt(phys);
- pgd = pgd_offset_k( (unsigned long)addr);
- pmd = pmd_offset(pgd, (unsigned long)addr);
- pte = pte_offset(pmd, (unsigned long)addr);
- bus = (pte->pte_low & PAGE_MASK) | (phys & ~PAGE_MASK);
- return bus;
-}
-
-#define virt_to_bus(_x) phys_to_bus(virt_to_phys(_x))
-#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
-#define page_to_bus(_x) phys_to_bus(page_to_phys(_x))
-
-#else
-
#define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x))
#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x))
#define page_to_bus(_x) phys_to_machine(page_to_phys(_x))
-
-#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
+#define bus_to_phys(_x) machine_to_phys(_x)
+#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT))
/*
* readX/writeX() are used to access memory mapped devices. On some
diff --git a/xenolinux-2.4.26-sparse/include/asm-xen/pci.h b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h
new file mode 100644
index 0000000000..74ae5ba8b1
--- /dev/null
+++ b/xenolinux-2.4.26-sparse/include/asm-xen/pci.h
@@ -0,0 +1,283 @@
+#ifndef __i386_PCI_H
+#define __i386_PCI_H
+
+#include <linux/config.h>
+
+#ifdef __KERNEL__
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+ already-configured bus numbers - to be used for buggy BIOSes
+ or architectures with incomplete PCI setup by the loader */
+
+#ifdef CONFIG_PCI
+extern unsigned int pcibios_assign_all_busses(void);
+#else
+#define pcibios_assign_all_busses() 0
+#endif
+#define pcibios_scan_all_fns() 0
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO 0x1000
+#define PCIBIOS_MIN_MEM (pci_mem_start)
+
+void pcibios_config_init(void);
+struct pci_bus * pcibios_scan_root(int bus);
+extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
+extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+/* Dynamic DMA mapping stuff.
+ * i386 has everything mapped statically.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/scatterlist.h>
+#include <linux/string.h>
+#include <asm/io.h>
+
+struct pci_dev;
+
+/* The networking and block device layers use this boolean for bounce
+ * buffer decisions.
+ */
+#define PCI_DMA_BUS_IS_PHYS (0)
+
+/* Allocate and map kernel buffer using consistent mode DMA for a device.
+ * hwdev should be valid struct pci_dev pointer for PCI devices,
+ * NULL for PCI-like buses (ISA, EISA).
+ * Returns non-NULL cpu-view pointer to the buffer if successful and
+ * sets *dma_addrp to the pci side dma address as well, else *dma_addrp
+ * is undefined.
+ */
+extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
+ dma_addr_t *dma_handle);
+
+/* Free and unmap a consistent DMA buffer.
+ * cpu_addr is what was returned from pci_alloc_consistent,
+ * size must be the same as what was passed into pci_alloc_consistent,
+ * and likewise dma_addr must be the same as what *dma_addrp was set to.
+ *
+ * References to the memory and mappings associated with cpu_addr/dma_addr
+ * past this call are illegal.
+ */
+extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
+ void *vaddr, dma_addr_t dma_handle);
+
+/* Map a single buffer of the indicated size for DMA in streaming mode.
+ * The 32-bit bus address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory
+ * until either pci_unmap_single or pci_dma_sync_single is performed.
+ */
+static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+ flush_write_buffers();
+ return virt_to_bus(ptr);
+}
+
+/* Unmap a single streaming mode DMA translation. The dma_addr and size
+ * must match what was provided for in a previous pci_map_single call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+ /* Nothing to do */
+}
+
+/*
+ * pci_{map,unmap}_page maps a kernel page to a dma_addr_t. Identical
+ * to pci_map_single, but takes a struct page instead of a virtual address
+ */
+static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+
+ return page_to_bus(page) + offset;
+}
+
+static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+ /* Nothing to do */
+}
+
+/* pci_unmap_{page,single} is a nop so... */
+#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
+#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
+#define pci_unmap_addr(PTR, ADDR_NAME) (0)
+#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
+#define pci_unmap_len(PTR, LEN_NAME) (0)
+#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
+
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA. This is the scatter-gather version of the
+ * above pci_map_single interface. Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length. They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ int i;
+
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+
+ /*
+ * temporary 2.4 hack
+ */
+ for (i = 0; i < nents; i++ ) {
+ if (sg[i].address && sg[i].page)
+ out_of_line_bug();
+ else if (!sg[i].address && !sg[i].page)
+ out_of_line_bug();
+
+ if (sg[i].address)
+ sg[i].dma_address = virt_to_bus(sg[i].address);
+ else
+ sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset;
+ }
+
+ flush_write_buffers();
+ return nents;
+}
+
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
+ int nents, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+ /* Nothing to do */
+}
+
+/* Make physical memory consistent for a single
+ * streaming mode DMA translation after a transfer.
+ *
+ * If you perform a pci_map_single() but wish to interrogate the
+ * buffer using the cpu, yet do not wish to tear down the PCI dma
+ * mapping, you must call this function before doing so. At the
+ * next point you give the PCI dma address back to the card, the
+ * device again owns the buffer.
+ */
+static inline void pci_dma_sync_single(struct pci_dev *hwdev,
+ dma_addr_t dma_handle,
+ size_t size, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+ flush_write_buffers();
+}
+
+/* Make physical memory consistent for a set of streaming
+ * mode DMA translations after a transfer.
+ *
+ * The same as pci_dma_sync_single but for a scatter-gather list,
+ * same rules and usage.
+ */
+static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
+ struct scatterlist *sg,
+ int nelems, int direction)
+{
+ if (direction == PCI_DMA_NONE)
+ out_of_line_bug();
+ flush_write_buffers();
+}
+
+/* Return whether the given PCI device DMA address mask can
+ * be supported properly. For example, if your device can
+ * only drive the low 24-bits during PCI bus mastering, then
+ * you would pass 0x00ffffff as the mask to this function.
+ */
+static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
+{
+ /*
+ * we fall back to GFP_DMA when the mask isn't all 1s,
+ * so we can't guarantee allocations that must be
+ * within a tighter range than GFP_DMA..
+ */
+ if(mask < 0x00ffffff)
+ return 0;
+
+ return 1;
+}
+
+/* This is always fine. */
+#define pci_dac_dma_supported(pci_dev, mask) (1)
+
+static __inline__ dma64_addr_t
+pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction)
+{
+ return ((dma64_addr_t) page_to_bus(page) +
+ (dma64_addr_t) offset);
+}
+
+static __inline__ struct page *
+pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr)
+{
+ return bus_to_page(dma_addr);
+}
+
+static __inline__ unsigned long
+pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr)
+{
+ return (dma_addr & ~PAGE_MASK);
+}
+
+static __inline__ void
+pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction)
+{
+ flush_write_buffers();
+}
+
+/* These macros should be used after a pci_map_sg call has been done
+ * to get bus addresses of each of the SG entries and their lengths.
+ * You should only work with the number of sg entries pci_map_sg
+ * returns.
+ */
+#define sg_dma_address(sg) ((sg)->dma_address)
+#define sg_dma_len(sg) ((sg)->length)
+
+/* Return the index of the PCI controller for device. */
+static inline int pci_controller_num(struct pci_dev *dev)
+{
+ return 0;
+}
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+ enum pci_mmap_state mmap_state, int write_combine);
+
+#endif /* __KERNEL__ */
+
+#endif /* __i386_PCI_H */
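For orientation, the streaming-DMA half of this header is used in the normal Linux 2.4 pattern; a hedged kernel-context sketch follows (the function and variable names are made up for illustration). The only Xen-specific twist is that virt_to_bus() now resolves through phys_to_machine(), so the dma_addr_t handed to the card is a machine address.

/* Hypothetical driver fragment, not part of the patch. */
static void example_dma_tx(struct pci_dev *pdev, void *buf, size_t len)
{
    dma_addr_t bus;

    /* CPU -> device: map the buffer and give the bus (machine) address
     * to the hardware. On this architecture the mapping is static, so
     * pci_map_single() just flushes write buffers and returns it. */
    bus = pci_map_single(pdev, buf, len, PCI_DMA_TODEVICE);

    /* ... program the device with 'bus' and wait for completion ... */

    /* No-op here, but keeps the driver portable to IOMMU platforms. */
    pci_unmap_single(pdev, bus, len, PCI_DMA_TODEVICE);
}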
diff --git a/xenolinux-2.4.26-sparse/mkbuildtree b/xenolinux-2.4.26-sparse/mkbuildtree
index 46fe4784ad..2e9f7b9920 100755
--- a/xenolinux-2.4.26-sparse/mkbuildtree
+++ b/xenolinux-2.4.26-sparse/mkbuildtree
@@ -163,7 +163,6 @@ ln -sf ../asm-i386/mtrr.h
ln -sf ../asm-i386/namei.h
ln -sf ../asm-i386/param.h
ln -sf ../asm-i386/parport.h
-ln -sf ../asm-i386/pci.h
ln -sf ../asm-i386/pgtable-3level.h
ln -sf ../asm-i386/poll.h
ln -sf ../asm-i386/posix_types.h
diff --git a/xenolinux-2.4.26-sparse/mm/page_alloc.c b/xenolinux-2.4.26-sparse/mm/page_alloc.c
new file mode 100644
index 0000000000..62ed7751a5
--- /dev/null
+++ b/xenolinux-2.4.26-sparse/mm/page_alloc.c
@@ -0,0 +1,930 @@
+/*
+ * linux/mm/page_alloc.c
+ *
+ * Manages the free list, the system allocates free pages here.
+ * Note that kmalloc() lives in slab.c
+ *
+ * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
+ * Swap reorganised 29.12.95, Stephen Tweedie
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
+ * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapctl.h>
+#include <linux/interrupt.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+int nr_swap_pages;
+int nr_active_pages;
+int nr_inactive_pages;
+LIST_HEAD(inactive_list);
+LIST_HEAD(active_list);
+pg_data_t *pgdat_list;
+
+/*
+ *
+ * The zone_table array is used to look up the address of the
+ * struct zone corresponding to a given zone number (ZONE_DMA,
+ * ZONE_NORMAL, or ZONE_HIGHMEM).
+ */
+zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
+EXPORT_SYMBOL(zone_table);
+
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
+static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
+static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
+
+int vm_gfp_debug = 0;
+
+/*
+ * Temporary debugging check.
+ */
+#define BAD_RANGE(zone, page) \
+( \
+ (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
+ || (((page) - mem_map) < (zone)->zone_start_mapnr) \
+ || ((zone) != page_zone(page)) \
+)
+
+/*
+ * Freeing function for a buddy system allocator.
+ * Contrary to prior comments, this is *NOT* hairy, and there
+ * is no reason for anyone not to understand it.
+ *
+ * The concept of a buddy system is to maintain direct-mapped tables
+ * (containing bit values) for memory blocks of various "orders".
+ * The bottom level table contains the map for the smallest allocatable
+ * units of memory (here, pages), and each level above it describes
+ * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry
+ * at the bottom level available, and propagating the changes upward
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep one bit for each pair of blocks, which
+ * is set to 1 iff only one of the pair is allocated. So when we
+ * are allocating or freeing one, we can derive the state of the
+ * other. That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
+ * If a block is freed, and its buddy is also free, then this
+ * triggers coalescing into a block of larger size.
+ *
+ * -- wli
+ */
+
+static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
+static void __free_pages_ok (struct page *page, unsigned int order)
+{
+ unsigned long index, page_idx, mask, flags;
+ free_area_t *area;
+ struct page *base;
+ zone_t *zone;
+
+ /*
+ * Yes, think what happens when other parts of the kernel take
+ * a reference to a page in order to pin it for io. -ben
+ */
+ if (PageLRU(page)) {
+ if (unlikely(in_interrupt()))
+ BUG();
+ lru_cache_del(page);
+ }
+
+ if (page->buffers)
+ BUG();
+ if (page->mapping)
+ return (*(void(*)(struct page *))page->mapping)(page);
+ if (!VALID_PAGE(page))
+ BUG();
+ if (PageLocked(page))
+ BUG();
+ if (PageActive(page))
+ BUG();
+ ClearPageReferenced(page);
+ ClearPageDirty(page);
+
+ if (current->flags & PF_FREE_PAGES)
+ goto local_freelist;
+ back_local_freelist:
+
+ zone = page_zone(page);
+
+ mask = (~0UL) << order;
+ base = zone->zone_mem_map;
+ page_idx = page - base;
+ if (page_idx & ~mask)
+ BUG();
+ index = page_idx >> (1 + order);
+
+ area = zone->free_area + order;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ zone->free_pages -= mask;
+
+ while (mask + (1 << (MAX_ORDER-1))) {
+ struct page *buddy1, *buddy2;
+
+ if (area >= zone->free_area + MAX_ORDER)
+ BUG();
+ if (!__test_and_change_bit(index, area->map))
+ /*
+ * the buddy page is still allocated.
+ */
+ break;
+ /*
+ * Move the buddy up one level.
+ * This code is taking advantage of the identity:
+ * -mask = 1+~mask
+ */
+ buddy1 = base + (page_idx ^ -mask);
+ buddy2 = base + page_idx;
+ if (BAD_RANGE(zone,buddy1))
+ BUG();
+ if (BAD_RANGE(zone,buddy2))
+ BUG();
+
+ list_del(&buddy1->list);
+ mask <<= 1;
+ area++;
+ index >>= 1;
+ page_idx &= mask;
+ }
+ list_add(&(base + page_idx)->list, &area->free_list);
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return;
+
+ local_freelist:
+ if (current->nr_local_pages)
+ goto back_local_freelist;
+ if (in_interrupt())
+ goto back_local_freelist;
+
+ list_add(&page->list, &current->local_pages);
+ page->index = order;
+ current->nr_local_pages++;
+}
+
+#define MARK_USED(index, order, area) \
+ __change_bit((index) >> (1+(order)), (area)->map)
+
+static inline struct page * expand (zone_t *zone, struct page *page,
+ unsigned long index, int low, int high, free_area_t * area)
+{
+ unsigned long size = 1 << high;
+
+ while (high > low) {
+ if (BAD_RANGE(zone,page))
+ BUG();
+ area--;
+ high--;
+ size >>= 1;
+ list_add(&(page)->list, &(area)->free_list);
+ MARK_USED(index, high, area);
+ index += size;
+ page += size;
+ }
+ if (BAD_RANGE(zone,page))
+ BUG();
+ return page;
+}
+
+static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
+static struct page * rmqueue(zone_t *zone, unsigned int order)
+{
+ free_area_t * area = zone->free_area + order;
+ unsigned int curr_order = order;
+ struct list_head *head, *curr;
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ do {
+ head = &area->free_list;
+ curr = head->next;
+
+ if (curr != head) {
+ unsigned int index;
+
+ page = list_entry(curr, struct page, list);
+ if (BAD_RANGE(zone,page))
+ BUG();
+ list_del(curr);
+ index = page - zone->zone_mem_map;
+ if (curr_order != MAX_ORDER-1)
+ MARK_USED(index, curr_order, area);
+ zone->free_pages -= 1UL << order;
+
+ page = expand(zone, page, index, order, curr_order, area);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ set_page_count(page, 1);
+ if (BAD_RANGE(zone,page))
+ BUG();
+ if (PageLRU(page))
+ BUG();
+ if (PageActive(page))
+ BUG();
+ return page;
+ }
+ curr_order++;
+ area++;
+ } while (curr_order < MAX_ORDER);
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return NULL;
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+ return __alloc_pages(gfp_mask, order,
+ contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
+}
+#endif
+
+static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
+static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
+{
+ struct page * page = NULL;
+ int __freed;
+
+ if (in_interrupt())
+ BUG();
+
+ current->allocation_order = order;
+ current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
+
+ __freed = try_to_free_pages_zone(classzone, gfp_mask);
+
+ current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
+
+ if (current->nr_local_pages) {
+ struct list_head * entry, * local_pages;
+ struct page * tmp;
+ int nr_pages;
+
+ local_pages = &current->local_pages;
+
+ if (likely(__freed)) {
+ /* pick from the last inserted so we're lifo */
+ entry = local_pages->next;
+ do {
+ tmp = list_entry(entry, struct page, list);
+ if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
+ list_del(entry);
+ current->nr_local_pages--;
+ set_page_count(tmp, 1);
+ page = tmp;
+
+ if (page->buffers)
+ BUG();
+ if (page->mapping)
+ BUG();
+ if (!VALID_PAGE(page))
+ BUG();
+ if (PageLocked(page))
+ BUG();
+ if (PageLRU(page))
+ BUG();
+ if (PageActive(page))
+ BUG();
+ if (PageDirty(page))
+ BUG();
+
+ break;
+ }
+ } while ((entry = entry->next) != local_pages);
+ }
+
+ nr_pages = current->nr_local_pages;
+ /* free in reverse order so that the global order will be lifo */
+ while ((entry = local_pages->prev) != local_pages) {
+ list_del(entry);
+ tmp = list_entry(entry, struct page, list);
+ __free_pages_ok(tmp, tmp->index);
+ if (!nr_pages--)
+ BUG();
+ }
+ current->nr_local_pages = 0;
+ }
+
+ *freed = __freed;
+ return page;
+}
+
+static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
+{
+ long free = zone->free_pages - (1UL << order);
+ return free >= 0 ? free : 0;
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator:
+ */
+struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
+{
+ zone_t **zone, * classzone;
+ struct page * page;
+ int freed, class_idx;
+
+ zone = zonelist->zones;
+ classzone = *zone;
+ class_idx = zone_idx(classzone);
+
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+
+ if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+
+ classzone->need_balance = 1;
+ mb();
+ if (waitqueue_active(&kswapd_wait))
+ wake_up_interruptible(&kswapd_wait);
+
+ zone = zonelist->zones;
+ for (;;) {
+ unsigned long min;
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+
+ min = z->watermarks[class_idx].min;
+ if (!(gfp_mask & __GFP_WAIT))
+ min >>= 2;
+ if (zone_free_pages(z, order) > min) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+
+ /* here we're in the low on memory slow path */
+
+ if ((current->flags & PF_MEMALLOC) &&
+ (!in_interrupt() || (current->flags & PF_MEMDIE))) {
+ zone = zonelist->zones;
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ return NULL;
+ }
+
+ /* Atomic allocations - we can't balance anything */
+ if (!(gfp_mask & __GFP_WAIT))
+ goto out;
+
+ rebalance:
+ page = balance_classzone(classzone, gfp_mask, order, &freed);
+ if (page)
+ return page;
+
+ zone = zonelist->zones;
+ if (likely(freed)) {
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+
+ if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+ goto rebalance;
+ } else {
+ /*
+ * Check that no other task is been killed meanwhile,
+ * in such a case we can succeed the allocation.
+ */
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+
+ if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+ }
+
+ out:
+ printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
+ order, gfp_mask, !!(current->flags & PF_MEMALLOC));
+ if (unlikely(vm_gfp_debug))
+ dump_stack();
+ return NULL;
+}
+
+/*
+ * Common helper functions.
+ */
+unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
+{
+ struct page * page;
+
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
+ return 0;
+ return (unsigned long) page_address(page);
+}
+
+unsigned long get_zeroed_page(unsigned int gfp_mask)
+{
+ struct page * page;
+
+ page = alloc_pages(gfp_mask, 0);
+ if (page) {
+ void *address = page_address(page);
+ clear_page(address);
+ return (unsigned long) address;
+ }
+ return 0;
+}
+
+void __free_pages(struct page *page, unsigned int order)
+{
+ if (!PageReserved(page) && put_page_testzero(page))
+ __free_pages_ok(page, order);
+}
+
+void free_pages(unsigned long addr, unsigned int order)
+{
+ if (addr != 0)
+ __free_pages(virt_to_page(addr), order);
+}
+
+/*
+ * Total amount of free (allocatable) RAM:
+ */
+unsigned int nr_free_pages (void)
+{
+ unsigned int sum = 0;
+ zone_t *zone;
+
+ for_each_zone(zone)
+ sum += zone->free_pages;
+
+ return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+ pg_data_t *pgdat;
+ unsigned int sum = 0;
+ zonelist_t *zonelist;
+ zone_t **zonep, *zone;
+
+ for_each_pgdat(pgdat) {
+ int class_idx;
+ zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
+ zonep = zonelist->zones;
+ zone = *zonep;
+ class_idx = zone_idx(zone);
+
+ sum += zone->nr_cache_pages;
+ for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ int free = zone->free_pages - zone->watermarks[class_idx].high;
+ if (free <= 0)
+ continue;
+ sum += free;
+ }
+ }
+
+ return sum;
+}
+
+#if CONFIG_HIGHMEM
+unsigned int nr_free_highpages (void)
+{
+ pg_data_t *pgdat;
+ unsigned int pages = 0;
+
+ for_each_pgdat(pgdat)
+ pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
+
+ return pages;
+}
+
+unsigned int freeable_lowmem(void)
+{
+ unsigned int pages = 0;
+ pg_data_t *pgdat;
+
+ for_each_pgdat(pgdat) {
+ pages += pgdat->node_zones[ZONE_DMA].free_pages;
+ pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
+ pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
+ pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
+ pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
+ pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
+ }
+
+ return pages;
+}
+#endif
+
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+/*
+ * Show free area list (used inside shift_scroll-lock stuff)
+ * We also calculate the percentage fragmentation. We do this by counting the
+ * memory on each free list with the exception of the first item on the list.
+ */
+void show_free_areas_core(pg_data_t *pgdat)
+{
+ unsigned int order;
+ unsigned type;
+ pg_data_t *tmpdat = pgdat;
+
+ printk("Free pages: %6dkB (%6dkB HighMem)\n",
+ K(nr_free_pages()),
+ K(nr_free_highpages()));
+
+ while (tmpdat) {
+ zone_t *zone;
+ for (zone = tmpdat->node_zones;
+ zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
+ printk("Zone:%s freepages:%6lukB\n",
+ zone->name,
+ K(zone->free_pages));
+
+ tmpdat = tmpdat->node_next;
+ }
+
+ printk("( Active: %d, inactive: %d, free: %d )\n",
+ nr_active_pages,
+ nr_inactive_pages,
+ nr_free_pages());
+
+ for (type = 0; type < MAX_NR_ZONES; type++) {
+ struct list_head *head, *curr;
+ zone_t *zone = pgdat->node_zones + type;
+ unsigned long nr, total, flags;
+
+ total = 0;
+ if (zone->size) {
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ head = &(zone->free_area + order)->free_list;
+ curr = head;
+ nr = 0;
+ for (;;) {
+ if ((curr = curr->next) == head)
+ break;
+ nr++;
+ }
+ total += nr * (1 << order);
+ printk("%lu*%lukB ", nr, K(1UL) << order);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+ printk("= %lukB)\n", K(total));
+ }
+
+#ifdef SWAP_CACHE_INFO
+ show_swap_cache_info();
+#endif
+}
+
+void show_free_areas(void)
+{
+ show_free_areas_core(pgdat_list);
+}
+
+/*
+ * Builds allocation fallback zone lists.
+ */
+static inline void build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k;
+
+ for (i = 0; i <= GFP_ZONEMASK; i++) {
+ zonelist_t *zonelist;
+ zone_t *zone;
+
+ zonelist = pgdat->node_zonelists + i;
+ memset(zonelist, 0, sizeof(*zonelist));
+
+ j = 0;
+ k = ZONE_NORMAL;
+ if (i & __GFP_HIGHMEM)
+ k = ZONE_HIGHMEM;
+ if (i & __GFP_DMA)
+ k = ZONE_DMA;
+
+ switch (k) {
+ default:
+ BUG();
+ /*
+ * fallthrough:
+ */
+ case ZONE_HIGHMEM:
+ zone = pgdat->node_zones + ZONE_HIGHMEM;
+ if (zone->size) {
+#ifndef CONFIG_HIGHMEM
+ BUG();
+#endif
+ zonelist->zones[j++] = zone;
+ }
+ case ZONE_NORMAL:
+ zone = pgdat->node_zones + ZONE_NORMAL;
+ if (zone->size)
+ zonelist->zones[j++] = zone;
+ case ZONE_DMA:
+ zone = pgdat->node_zones + ZONE_DMA;
+ if (zone->size)
+ zonelist->zones[j++] = zone;
+ }
+ zonelist->zones[j++] = NULL;
+ }
+}
+
+/*
+ * Helper functions to size the waitqueue hash table.
+ * Essentially these want to choose hash table sizes sufficiently
+ * large so that collisions trying to wait on pages are rare.
+ * But in fact, the number of active page waitqueues on typical
+ * systems is ridiculously low, less than 200. So this is even
+ * conservative, even though it seems large.
+ *
+ * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
+ * waitqueues, i.e. the size of the waitq table given the number of pages.
+ */
+#define PAGES_PER_WAITQUEUE 256
+
+static inline unsigned long wait_table_size(unsigned long pages)
+{
+ unsigned long size = 1;
+
+ pages /= PAGES_PER_WAITQUEUE;
+
+ while (size < pages)
+ size <<= 1;
+
+ /*
+ * Once we have dozens or even hundreds of threads sleeping
+ * on IO we've got bigger problems than wait queue collision.
+ * Limit the size of the wait table to a reasonable size.
+ */
+ size = min(size, 4096UL);
+
+ return size;
+}
+
+/*
+ * This is an integer logarithm so that shifts can be used later
+ * to extract the more random high bits from the multiplicative
+ * hash function before the remainder is taken.
+ */
+static inline unsigned long wait_table_bits(unsigned long size)
+{
+ return ffz(~size);
+}
+
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ */
+void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
+ unsigned long *zones_size, unsigned long zone_start_paddr,
+ unsigned long *zholes_size, struct page *lmem_map)
+{
+ unsigned long i, j;
+ unsigned long map_size;
+ unsigned long totalpages, offset, realtotalpages;
+ const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+
+ if (zone_start_paddr & ~PAGE_MASK)
+ BUG();
+
+ totalpages = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ unsigned long size = zones_size[i];
+ totalpages += size;
+ }
+ realtotalpages = totalpages;
+ if (zholes_size)
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ realtotalpages -= zholes_size[i];
+
+ printk("On node %d totalpages: %lu\n", nid, realtotalpages);
+
+ /*
+ * Some architectures (with lots of mem and discontinous memory
+ * maps) have to search for a good mem_map area:
+ * For discontigmem, the conceptual mem map array starts from
+ * PAGE_OFFSET, we need to align the actual array onto a mem map
+ * boundary, so that MAP_NR works.
+ */
+ map_size = (totalpages + 1)*sizeof(struct page);
+ if (lmem_map == (struct page *)0) {
+ lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
+ lmem_map = (struct page *)(PAGE_OFFSET +
+ MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
+ }
+ *gmap = pgdat->node_mem_map = lmem_map;
+ pgdat->node_size = totalpages;
+ pgdat->node_start_paddr = zone_start_paddr;
+ pgdat->node_start_mapnr = (lmem_map - mem_map);
+ pgdat->nr_zones = 0;
+
+ offset = lmem_map - mem_map;
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ zone_t *zone = pgdat->node_zones + j;
+ unsigned long mask;
+ unsigned long size, realsize;
+ int idx;
+
+ zone_table[nid * MAX_NR_ZONES + j] = zone;
+ realsize = size = zones_size[j];
+ if (zholes_size)
+ realsize -= zholes_size[j];
+
+ printk("zone(%lu): %lu pages.\n", j, size);
+ zone->size = size;
+ zone->realsize = realsize;
+ zone->name = zone_names[j];
+ zone->lock = SPIN_LOCK_UNLOCKED;
+ zone->zone_pgdat = pgdat;
+ zone->free_pages = 0;
+ zone->need_balance = 0;
+ zone->nr_active_pages = zone->nr_inactive_pages = 0;
+
+
+ if (!size)
+ continue;
+
+ /*
+ * The per-page waitqueue mechanism uses hashed waitqueues
+ * per zone.
+ */
+ zone->wait_table_size = wait_table_size(size);
+ zone->wait_table_shift =
+ BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
+ zone->wait_table = (wait_queue_head_t *)
+ alloc_bootmem_node(pgdat, zone->wait_table_size
+ * sizeof(wait_queue_head_t));
+
+ for(i = 0; i < zone->wait_table_size; ++i)
+ init_waitqueue_head(zone->wait_table + i);
+
+ pgdat->nr_zones = j+1;
+
+ mask = (realsize / zone_balance_ratio[j]);
+ if (mask < zone_balance_min[j])
+ mask = zone_balance_min[j];
+ else if (mask > zone_balance_max[j])
+ mask = zone_balance_max[j];
+ zone->watermarks[j].min = mask;
+ zone->watermarks[j].low = mask*2;
+ zone->watermarks[j].high = mask*3;
+ /* now set the watermarks of the lower zones in the "j" classzone */
+ for (idx = j-1; idx >= 0; idx--) {
+ zone_t * lower_zone = pgdat->node_zones + idx;
+ unsigned long lower_zone_reserve;
+ if (!lower_zone->size)
+ continue;
+
+ mask = lower_zone->watermarks[idx].min;
+ lower_zone->watermarks[j].min = mask;
+ lower_zone->watermarks[j].low = mask*2;
+ lower_zone->watermarks[j].high = mask*3;
+
+ /* now the brainer part */
+ lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
+ lower_zone->watermarks[j].min += lower_zone_reserve;
+ lower_zone->watermarks[j].low += lower_zone_reserve;
+ lower_zone->watermarks[j].high += lower_zone_reserve;
+
+ realsize += lower_zone->realsize;
+ }
+
+ zone->zone_mem_map = mem_map + offset;
+ zone->zone_start_mapnr = offset;
+ zone->zone_start_paddr = zone_start_paddr;
+
+ if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
+ printk("BUG: wrong zone alignment, it will crash\n");
+
+ /*
+ * Initially all pages are reserved - free ones are freed
+ * up by free_all_bootmem() once the early boot process is
+ * done. Non-atomic initialization, single-pass.
+ */
+ for (i = 0; i < size; i++) {
+ struct page *page = mem_map + offset + i;
+ set_page_zone(page, nid * MAX_NR_ZONES + j);
+ set_page_count(page, 0);
+ SetPageReserved(page);
+ INIT_LIST_HEAD(&page->list);
+ if (j != ZONE_HIGHMEM)
+ set_page_address(page, __va(zone_start_paddr));
+ zone_start_paddr += PAGE_SIZE;
+ }
+
+ offset += size;
+ for (i = 0; ; i++) {
+ unsigned long bitmap_size;
+
+ INIT_LIST_HEAD(&zone->free_area[i].free_list);
+ if (i == MAX_ORDER-1) {
+ zone->free_area[i].map = NULL;
+ break;
+ }
+
+ /*
+ * Page buddy system uses "index >> (i+1)",
+ * where "index" is at most "size-1".
+ *
+ * The extra "+3" is to round down to byte
+ * size (8 bits per byte assumption). Thus
+ * we get "(size-1) >> (i+4)" as the last byte
+ * we can access.
+ *
+ * The "+1" is because we want to round the
+ * byte allocation up rather than down. So
+ * we should have had a "+7" before we shifted
+ * down by three. Also, we have to add one as
+ * we actually _use_ the last bit (it's [0,n]
+ * inclusive, not [0,n[).
+ *
+ * So we actually had +7+1 before we shift
+ * down by 3. But (n+8) >> 3 == (n >> 3) + 1
+ * (modulo overflows, which we do not have).
+ *
+ * Finally, we LONG_ALIGN because all bitmap
+ * operations are on longs.
+ */
+ bitmap_size = (size-1) >> (i+4);
+ bitmap_size = LONG_ALIGN(bitmap_size+1);
+ zone->free_area[i].map =
+ (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
+ }
+ }
+ build_zonelists(pgdat);
+}
+
+void __init free_area_init(unsigned long *zones_size)
+{
+ free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
+}
+
+static int __init setup_mem_frac(char *str)
+{
+ int j = 0;
+
+ while (get_option(&str, &zone_balance_ratio[j++]) == 2);
+ printk("setup_mem_frac: ");
+ for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
+ printk("\n");
+ return 1;
+}
+
+__setup("memfrac=", setup_mem_frac);
+
+static int __init setup_lower_zone_reserve(char *str)
+{
+ int j = 0;
+
+ while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
+ printk("setup_lower_zone_reserve: ");
+ for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]);
+ printk("\n");
+ return 1;
+}
+
+__setup("lower_zone_reserve=", setup_lower_zone_reserve);
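Finally, the buddy bookkeeping in __free_pages_ok() above leans on the identity -mask == 1 + ~mask noted in its comments: with mask = ~0UL << order, -mask equals the block size in pages, so page_idx ^ -mask flips exactly the bit that separates a block from its buddy. A stand-alone check of that arithmetic (values chosen arbitrarily for illustration):

#include <stdio.h>

int main(void)
{
    unsigned int  order    = 2;              /* blocks of 4 pages */
    unsigned long mask     = (~0UL) << order;
    unsigned long page_idx = 8;              /* start of the freed block */

    /* -mask == 1 + ~mask == 1UL << order, i.e. the block size in pages. */
    printf("-mask     = %lu\n", -mask);              /* 4 */

    /* XOR with the block size yields the buddy block of the same order. */
    printf("buddy_idx = %lu\n", page_idx ^ -mask);   /* 12 */

    /* One bit per buddy pair in the order-'order' free_area bitmap. */
    printf("map index = %lu\n", page_idx >> (1 + order));   /* 1 */
    return 0;
}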