bitkeeper revision 1.1662.1.15 (42a5968eiZE_DjdIFPjxvzLw6ACvCQ)

Add xenstore daemon and library. Makefile: Add xenstore subdirectory. Remove xs_stress on clean. Many files: new file ignore: Update ignore list for xenstore. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (authored) Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
author: cl349@firebug.cl.cam.ac.uk <cl349@firebug.cl.cam.ac.uk> 2005-06-07 12:43:58 +0000
committer: cl349@firebug.cl.cam.ac.uk <cl349@firebug.cl.cam.ac.uk> 2005-06-07 12:43:58 +0000
commit: 29c9e570b1eddfd6df789e08da65cf4ddec5f6fe (patch)
tree: bf79ad3040d05ee9e05a60df3b8a364fcfa236dc
parent: 636a81e9701d001f4c9108f722014f48f59eabbd (diff)
download: xen-29c9e570b1eddfd6df789e08da65cf4ddec5f6fe.tar.gz
xen-29c9e570b1eddfd6df789e08da65cf4ddec5f6fe.tar.bz2
xen-29c9e570b1eddfd6df789e08da65cf4ddec5f6fe.zip
40 files changed, 9230 insertions, 1 deletions
diff --git a/.rootkeys b/.rootkeys
index 77b06993c5..a14deeaee8 100644
--- a/.rootkeys
+++ b/.rootkeys
@@ -996,6 +996,43 @@
 4292540couq-V0TPwyQ6bspNEWNcvw tools/xcutils/Makefile
 42925407VysDb9O06OK_RUzTZxfLoA tools/xcutils/xc_restore.c
 42936745WTLYamYsmXm_JGJ72JX-_Q tools/xcutils/xc_save.c
+42a57d97mxMTlPnxBKep6R4ViI5rjg tools/xenstore/.gdbinit
+42a57d97ZEoHuhMAFTuBMlLzA9v_ng tools/xenstore/Makefile
+42a57d97ccA4uY-RxONvIH0P8U0gqg tools/xenstore/TODO
+42a57d972RzmyLgsoH9b8qqk-UjcCA tools/xenstore/fake_libxc.c
+42a57d97IjoPvbIVc4BUzwoKyM0VSw tools/xenstore/list.h
+42a57d97fKgtf0HQLiQkAkVsOvuSyA tools/xenstore/talloc.c
+42a57d98U3p0XP6xzCybTuaVQscUdw tools/xenstore/talloc.h
+42a57d98LFN6Mug-uR4xgAxCE7lwUg tools/xenstore/talloc_guide.txt
+42a57d98S69vKJYwO_WUjoFQZ6KzQg tools/xenstore/testsuite/01simple.sh
+42a57d98BHcFpZz_fXHweylUEUU97Q tools/xenstore/testsuite/02directory.sh
+42a57d98ua4Xeb6pmtbFNTAI833dyw tools/xenstore/testsuite/03write.sh
+42a57d98nbuCUsVT0RJj1zA1JyMDsw tools/xenstore/testsuite/04rm.sh
+42a57d98_ULKHP3_uX1PK2nPMTzWSQ tools/xenstore/testsuite/05filepermissions.sh
+42a57d98YGCLyTDSGmoyFqRqQUlagQ tools/xenstore/testsuite/06dirpermissions.sh
+42a57d98fdO519YyATk4_Zwr1STNfQ tools/xenstore/testsuite/07watch.sh
+42a57d98zZUtvirUMjmHxFphJjmO7Q tools/xenstore/testsuite/08transaction.sh
+42a57d98sn9RbpBgHRv1D99Kt7LwYA tools/xenstore/testsuite/09domain.sh
+42a57d98tSuoFCHnnM2GgENXJrRQmw tools/xenstore/testsuite/test.sh
+42a57d98zxDP2Ti7dTznGROi66rUGw tools/xenstore/utils.c
+42a57d98SDvOYCEjmCjwHSk6390GLA tools/xenstore/utils.h
+42a57d98hFKbOY9D0mCE4H4NDoKr1w tools/xenstore/xenstored.h
+42a57d981KFHLmJ0CjKkn1_gZhYvdw tools/xenstore/xenstored_core.c
+42a57d98bcgE13vYaFxGTusmWbrFDA tools/xenstore/xenstored_core.h
+42a57d98cD9wOFyRYfaEP0QgtqL1Xw tools/xenstore/xenstored_domain.c
+42a57d98noLWvXU8ePbcqvvmu4p2Gw tools/xenstore/xenstored_domain.h
+42a57d98kxHaQ1ApS7RpqmFoEnDmbg tools/xenstore/xenstored_test.h
+42a57d981c9P3aFkWtxWEIRUapt_FQ tools/xenstore/xenstored_transaction.c
+42a57d99pVo__10bbckp_b_rm6i59A tools/xenstore/xenstored_transaction.h
+42a57d99izTIjWfG-IjQAPqYlDWJNg tools/xenstore/xenstored_watch.c
+42a57d99-zLxBjzC7rfj_perV-orUg tools/xenstore/xenstored_watch.h
+42a57d99BnkhISKgCCRcUqhteyuxCw tools/xenstore/xs.c
+42a57d99FyiYSz9AkKKROrRydnA-gQ tools/xenstore/xs.h
+42a57d99SrtsJCDUlKyRPf3EX86A1Q tools/xenstore/xs_lib.c
+42a57d99L2pYeMFyjQ_4Rnb17xTSMg tools/xenstore/xs_lib.h
+42a57d99Kl6Ba8oCHv2fggl7QN9QZA tools/xenstore/xs_random.c
+42a57d99SHYR1lQOD0shuErPDg9NKQ tools/xenstore/xs_stress.c
+42a57d996aBawpkQNOWkNWXD6LrhPg tools/xenstore/xs_test.c
 403a3edbrr8RE34gkbR40zep98SXbg tools/xentrace/Makefile
 40a107afN60pFdURgBv9KwEzgRl5mQ tools/xentrace/formats
 420d52d2_znVbT4JAPIU36vQOme83g tools/xentrace/xenctx.c
diff --git a/BitKeeper/etc/ignore b/BitKeeper/etc/ignore
index d996d45a72..b591ce7458 100644
--- a/BitKeeper/etc/ignore
+++ b/BitKeeper/etc/ignore
@@ -128,8 +128,13 @@ tools/xcs/xcs
 tools/xcs/xcsdump
 tools/xcutils/xc_restore
 tools/xcutils/xc_save
+tools/xenstore/testsuite/tmp/*
+tools/xenstore/xen
+tools/xenstore/xenstored_test
+tools/xenstore/xs_random
+tools/xenstore/xs_stress
+tools/xenstore/xs_test
 tools/xentrace/xentrace
-tools/xfrd/xfrd
 xen/BLOG
 xen/TAGS
 xen/arch/x86/asm-offsets.s
diff --git a/tools/Makefile b/tools/Makefile
index 95e8989d4c..5e4a2bd586 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -9,6 +9,7 @@ SUBDIRS += xentrace
 SUBDIRS += python
 SUBDIRS += xcs
 SUBDIRS += xcutils
+SUBDIRS += xenstore
 SUBDIRS += pygrub
 
 .PHONY: all install clean check check_clean ioemu eioemuinstall ioemuclean
diff --git a/tools/xenstore/.gdbinit b/tools/xenstore/.gdbinit
new file mode 100644
index 0000000000..9a71b20ac4
--- /dev/null
+++ b/tools/xenstore/.gdbinit
@@ -0,0 +1,4 @@
+set environment XENSTORED_RUNDIR=testsuite/tmp
+set environment XENSTORED_ROOTDIR=testsuite/tmp
+handle SIGUSR1 noprint nostop
+handle SIGPIPE noprint nostop
diff --git a/tools/xenstore/Makefile b/tools/xenstore/Makefile
new file mode 100644
index 0000000000..cd4a7b3079
--- /dev/null
+++ b/tools/xenstore/Makefile
@@ -0,0 +1,97 @@
+XEN_ROOT=../..
+# This does something wrong to TARGET_ARCH.
+#include $(XEN_ROOT)/tools/Rules.mk
+LIBDIR = lib
+XEN_LIBXC          = $(XEN_ROOT)/tools/libxc
+
+INSTALL         = install
+INSTALL_DATA	= $(INSTALL) -m0644
+INSTALL_PROG    = $(INSTALL) -m0755
+INSTALL_DIR     = $(INSTALL) -d -m0755
+
+PROFILE=#-pg
+BASECFLAGS=-Wall -W -g 
+# Make gcc generate dependencies.
+BASECFLAGS += -Wp,-MD,.$(@F).d
+PROG_DEP = .*.d
+#BASECFLAGS+= -O3 $(PROFILE)
+#BASECFLAGS+= -I$(XEN_ROOT)/tools
+BASECFLAGS+= -I$(XEN_ROOT)/tools/libxc
+BASECFLAGS+= -I$(XEN_ROOT)/xen/include/public
+BASECFLAGS+= -I.
+
+CFLAGS+=$(BASECFLAGS)
+LDFLAGS=$(PROFILE) -L$(XEN_LIBXC)
+TESTDIR=`pwd`/testsuite/tmp
+TESTFLAGS=-DTESTING
+TESTENV=XENSTORED_ROOTDIR=$(TESTDIR) XENSTORED_RUNDIR=$(TESTDIR)
+
+all: xen xenstored libxenstore.a # default goal: first non-special target
+.PHONY: all testcode clean check testsuite-run testsuite-clean randomcheck stresstest tarball install
+testcode: xen xs_test xenstored_test xs_random # test-only binaries
+
+xen:
+	ln -sf $(XEN_ROOT)/xen/include/public $@
+
+xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o
+	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxc -o $@
+
+xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o
+	$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+xs_test: xs_test.o xs_lib.o utils.o
+xs_random: xs_random.o xs_test_lib.o xs_lib.o talloc.o utils.o
+xs_stress: xs_stress.o xs_test_lib.o xs_lib.o talloc.o utils.o
+
+xs_test.o xs_stress.o xenstored_core_test.o xenstored_watch_test.o xenstored_transaction_test.o xenstored_domain_test.o xs_random.o xs_test_lib.o talloc_test.o fake_libxc.o: CFLAGS=$(BASECFLAGS) $(TESTFLAGS)
+
+xenstored_%_test.o: xenstored_%.c
+	$(COMPILE.c) -o $@ $<
+
+xs_test_lib.o: xs.c
+	$(COMPILE.c) -o $@ $<
+
+talloc_test.o: talloc.c
+	$(COMPILE.c) -o $@ $<
+
+libxenstore.a: libxenstore.a(xs.o) libxenstore.a(xs_lib.o)
+
+clean: testsuite-clean
+	rm -f *.o *.a xs_test xenstored xenstored_test xs_random xs_stress xen
+	-$(RM) $(PROG_DEP)
+
+check: testsuite-run randomcheck stresstest
+
+testsuite-run: xen xenstored_test xs_test
+	$(TESTENV) testsuite/test.sh
+
+testsuite-clean:
+	rm -rf $(TESTDIR)
+
+# Expanded once (:=) so all three runs share one seed; make echoes it in
+# each command so a failing run can be repeated (e.g. without --fast).
+RANDSEED := $(shell date +%s)
+randomcheck: xs_random xenstored_test
+	$(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED)
+	$(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED)
+	$(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED)
+
+stresstest: xs_stress xenstored_test
+	rm -rf $(TESTDIR)/store
+	export $(TESTENV); PID=`./xenstored_test --output-pid`; ./xs_stress 10000; ret=$$?; kill $$PID; exit $$ret
+
+TAGS:
+	etags `find . -name '*.[ch]'`
+
+tarball: clean
+	cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/
+
+install: xenstored libxenstore.a
+	$(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored
+	$(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored
+	$(INSTALL_DIR) -p $(DESTDIR)/usr/sbin
+	$(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin
+	$(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
+	$(INSTALL_DATA) libxenstore.a $(DESTDIR)/usr/$(LIBDIR)
+
+-include $(PROG_DEP)
diff --git a/tools/xenstore/TODO b/tools/xenstore/TODO
new file mode 100644
index 0000000000..9e22afe536
--- /dev/null
+++ b/tools/xenstore/TODO
@@ -0,0 +1,7 @@
+TODO in no particular order.  Some of these will never be done.  There
+are omissions of important but necessary things.  It is up to the
+reader to fill in the blanks.
+
+- Remove calls to system() from daemon
+- Timeout failed watch responses
+- Timeout blocking transactions
diff --git a/tools/xenstore/fake_libxc.c b/tools/xenstore/fake_libxc.c
new file mode 100644
index 0000000000..decfb4001d
--- /dev/null
+++ b/tools/xenstore/fake_libxc.c
@@ -0,0 +1,119 @@
+/* 
+    Fake libxc which doesn't require hypervisor but talks to xs_test.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <assert.h>
+#include <signal.h>
+#include "utils.h"
+#include "xenstored_core.h"
+#include "xenstored_domain.h"
+#include "xenstored_test.h"
+
+static int sigfd;
+static int xs_test_pid;
+static u16 port;
+
+/* The event channel maps to a signal, shared page to an mmapped file. */
+int xc_evtchn_send(int xc_handle __attribute__((unused)), int local_port)
+{
+	assert(local_port == port);	/* only the port read in xc_map_foreign_range() is accepted */
+	if (kill(xs_test_pid, SIGUSR2) != 0)	/* the fake "event" is SIGUSR2 sent to the recorded pid */
+		barf_perror("fake event channel failed");
+	return 0;
+}
+
+void *xc_map_foreign_range(int xc_handle, u32 dom __attribute__((unused)),
+			   int size, int prot,
+			   unsigned long mfn __attribute__((unused)))
+{
+	void *ret;
+
+	ret = mmap(NULL, size, prot, MAP_SHARED, xc_handle, 0);	/* xc_handle is used directly as the fd to map */
+	if (ret == MAP_FAILED)
+		return NULL;	/* callers see NULL rather than MAP_FAILED */
+
+	/* xs_test tells us its pid and port by putting them in the buffer; we reply. */
+	xs_test_pid = *(int *)(ret + 32);	/* peer pid: int at byte offset 32 */
+	port = *(int *)(ret + 36);	/* port: int at byte offset 36, stored into a u16 */
+	*(int *)(ret + 32) = getpid();	/* reply: overwrite the pid slot with our own pid */
+	return ret;
+}
+
+int xc_interface_open(void)
+{
+	int fd;
+	char page[getpagesize()];	/* one page worth of bytes, sized at run time */
+
+	fd = open("/tmp/xcmap", O_RDWR|O_CREAT|O_TRUNC, 0600);	/* fixed path; truncated on every open */
+	if (fd < 0)
+		return fd;	/* negative open() result is passed straight back */
+
+	memset(page, 0, sizeof(page));
+	if (!write_all(fd, page, sizeof(page)))	/* presumably pre-sizes the file for the later mmap - confirm */
+		barf_perror("Failed to write /tmp/xcmap page");
+	
+	return fd;	/* the fake "xc handle" is simply this fd */
+}
+
+int xc_interface_close(int xc_handle)
+{
+	close(xc_handle);	/* handle is treated as a plain fd; close() result is ignored */
+	return 0;	/* always reports success */
+}
+
+static void send_to_fd(int signo __attribute__((unused)))
+{
+	int saved_errno = errno;	/* installed as the SIGUSR2 handler: keep errno intact for interrupted code */
+	write(sigfd, &port, sizeof(port));	/* forward the event as the raw u16 port; result is ignored */
+	errno = saved_errno;
+}
+
+void fake_block_events(void)
+{
+	signal(SIGUSR2, SIG_IGN);	/* ignore SIGUSR2 until fake_ack_event() reinstalls the handler */
+}
+
+void fake_ack_event(void)
+{
+	signal(SIGUSR2, send_to_fd);	/* (re)install the handler that writes the port to sigfd */
+}
+
+int fake_open_eventchn(void)
+{
+	int fds[2];
+
+	if (pipe(fds) != 0)
+		return -1;	/* errno is whatever pipe() set */
+
+	if (signal(SIGUSR2, send_to_fd) == SIG_ERR) {
+		int saved_errno = errno;	/* close() may change errno; keep signal()'s value */
+		close(fds[0]);
+		close(fds[1]);
+		errno = saved_errno;
+		return -1;
+	}
+	sigfd = fds[1];	/* write end: where send_to_fd() writes the port */
+	return fds[0];	/* read end: handed to the caller as the event-channel fd */
+}
diff --git a/tools/xenstore/list.h b/tools/xenstore/list.h
new file mode 100644
index 0000000000..eb35293d7f
--- /dev/null
+++ b/tools/xenstore/list.h
@@ -0,0 +1,508 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+/* Taken from Linux kernel code, but de-kernelized for userspace. */
+#include <stddef.h>
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+#define container_of(ptr, type, member) ({			\
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+	(ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+#define list_top(head, type, member)					  \
+({ 									  \
+	struct list_head *_head = (head);				  \
+	list_empty(_head) ? NULL : list_entry(_head->next, type, member); \
+})
+
+/*
+ * Insert a new entry between two known consecutive entries. 
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Insert a new entry between two known consecutive entries. 
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static __inline__ void __list_add_rcu(struct list_head * new,
+	struct list_head * prev,
+	struct list_head * next)
+{
+	new->next = next;
+	new->prev = prev;
+	next->prev = new;
+	prev->next = new;
+}
+
+/**
+ * list_add_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head)
+{
+	__list_add_rcu(new, head, head->next);
+}
+
+/**
+ * list_add_tail_rcu - add a new entry to rcu-protected list
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head)
+{
+	__list_add_rcu(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * Note: list_empty on entry does not return true after this, 
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward 
+ * pointers that may still be used for walking the list.
+ */
+static inline void list_del_rcu(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry); 
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(struct list_head *head)
+{
+	return head->next == head;
+}
+
+static inline void __list_splice(struct list_head *list,
+				 struct list_head *head)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+	struct list_head *at = head->next;
+
+	first->prev = head;
+	head->next = first;
+
+	last->next = at;
+	at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop counter.
+ * @head:	the head for your list.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev	-	iterate over a list backwards
+ * @pos:	the &struct list_head to use as a loop counter.
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+	for (pos = (head)->prev; pos != (head); pos = pos->prev)
+        	
+/**
+ * list_for_each_safe	-	iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop counter.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_entry((head)->next, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+
+/**
+ * list_for_each_entry_continue -	iterate over list of given type
+ *			continuing after existing point
+ * @pos:	the type * to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_continue(pos, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop counter.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->next, typeof(*pos), member),	\
+		n = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+
+/* 
+ * Double linked lists with a single pointer list head. 
+ * Mostly useful for hash tables where the two pointer list head is 
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */ 
+
+struct hlist_head { 
+	struct hlist_node *first; 
+}; 
+
+struct hlist_node { 
+	struct hlist_node *next, **pprev; 
+}; 
+
+#define HLIST_HEAD_INIT { .first = NULL } 
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) 
+#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
+
+static __inline__ int hlist_unhashed(struct hlist_node *h) 
+{ 
+	return !h->pprev;
+} 
+
+static __inline__ int hlist_empty(struct hlist_head *h) 
+{ 
+	return !h->first;
+} 
+
+static __inline__ void __hlist_del(struct hlist_node *n) 
+{
+	struct hlist_node *next = n->next;
+	struct hlist_node **pprev = n->pprev;
+	*pprev = next;  
+	if (next) 
+		next->pprev = pprev;
+}  
+
+static __inline__ void hlist_del(struct hlist_node *n)
+{
+	__hlist_del(n);
+	n->next = LIST_POISON1;
+	n->pprev = LIST_POISON2;
+}
+
+/**
+ * hlist_del_rcu - deletes entry from hash list without re-initialization
+ * @entry: the element to delete from the hash list.
+ *
+ * Note: list_unhashed() on entry does not return true after this, 
+ * the entry is in an undefined state. It is useful for RCU based
+ * lockfree traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the hash list.
+ */
+static inline void hlist_del_rcu(struct hlist_node *n)
+{
+	__hlist_del(n);
+	n->pprev = LIST_POISON2;
+}
+
+static __inline__ void hlist_del_init(struct hlist_node *n) 
+{
+	if (n->pprev)  {
+		__hlist_del(n);
+		INIT_HLIST_NODE(n);
+	}
+}  
+
+#define hlist_del_rcu_init hlist_del_init
+
+static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) 
+{ 
+	struct hlist_node *first = h->first;
+	n->next = first; 
+	if (first) 
+		first->pprev = &n->next;
+	h->first = n; 
+	n->pprev = &h->first; 
+} 
+
+static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) 
+{ 
+	struct hlist_node *first = h->first;
+	n->next = first;
+	n->pprev = &h->first; 
+	if (first) 
+		first->pprev = &n->next;
+	h->first = n; 
+} 
+
+/* next must be != NULL */
+static __inline__ void hlist_add_before(struct hlist_node *n, struct hlist_node *next)
+{
+	n->pprev = next->pprev;
+	n->next = next; 
+	next->pprev = &n->next; 
+	*(n->pprev) = n;
+}
+
+static __inline__ void hlist_add_after(struct hlist_node *n, struct hlist_node *next)
+{
+	next->next = n->next;		/* insert @next directly after @n (already listed) */
+	n->next = next;
+	next->pprev = &n->next;		/* @next is now reached through n->next */
+	if (next->next)			/* old successor of @n, if there was one ... */
+		next->next->pprev = &next->next; /* ... is now reached through @next */
+}
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+/* Cannot easily do prefetch unfortunately */
+#define hlist_for_each(pos, head) \
+	for (pos = (head)->first; pos; pos = pos->next) 
+
+#define hlist_for_each_safe(pos, n, head) \
+	for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
+	     pos = n)
+
+/**
+ * hlist_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop counter.
+ * @pos:	the &struct hlist_node to use as a loop counter.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)			 \
+	for (pos = (head)->first;					 \
+	     pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point
+ * @tpos:	the type * to use as a loop counter.
+ * @pos:	the &struct hlist_node to use as a loop counter.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)		 \
+	for (pos = (pos)->next;						 \
+	     pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from existing point
+ * @tpos:	the type * to use as a loop counter.
+ * @pos:	the &struct hlist_node to use as a loop counter.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)			 \
+	for (; pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:	the type * to use as a loop counter.
+ * @pos:	the &struct hlist_node to use as a loop counter.
+ * @n:		another &struct hlist_node to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
+	for (pos = (head)->first;					 \
+	     pos && ({ n = pos->next; 1; }) && 				 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = n)
+
+#endif
diff --git a/tools/xenstore/talloc.c b/tools/xenstore/talloc.c
new file mode 100644
index 0000000000..8e93c28fe3
--- /dev/null
+++ b/tools/xenstore/talloc.c
@@ -0,0 +1,1143 @@
+/* 
+   Samba Unix SMB/CIFS implementation.
+
+   Samba trivial allocation library - new interface
+
+   NOTE: Please read talloc_guide.txt for full documentation
+
+   Copyright (C) Andrew Tridgell 2004
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/*
+  inspired by http://swapped.cc/halloc/
+*/
+
+
+#ifdef _SAMBA_BUILD_
+#include "includes.h"
+#if ((SAMBA_VERSION_MAJOR==3)&&(SAMBA_VERSION_MINOR<9))
+/* This is to circumvent SAMBA3's paranoid malloc checker. Here in this file
+ * we trust ourselves... */
+#ifdef malloc
+#undef malloc
+#endif
+#ifdef realloc
+#undef realloc
+#endif
+#endif
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include "talloc.h"
+/* assume a modern system */
+#define HAVE_VA_COPY
+#endif
+
+/* use this to force every realloc to change the pointer, to stress test
+   code that might not cope */
+#ifdef TESTING
+#define ALWAYS_REALLOC 1
+void *test_malloc(size_t size);
+#define malloc test_malloc
+#endif
+
+/* requests of this many bytes or more are refused (NULL is returned) */
+#define MAX_TALLOC_SIZE 0x10000000
+/* value of talloc_chunk.magic while the chunk is live */
+#define TALLOC_MAGIC 0xe814ec4f
+/* value written to talloc_chunk.magic just before the chunk is released */
+#define TALLOC_MAGIC_FREE 0x7faebef3
+/* sentinel stored in talloc_chunk.name to mark a reference-handle chunk */
+#define TALLOC_MAGIC_REFERENCE ((const char *)1)
+
+/* by default we abort when given a bad pointer (such as when talloc_free()
+   is called on a pointer that came from malloc()). The default definition
+   ignores the reason string; define TALLOC_ABORT before this point to
+   override it. */
+#ifndef TALLOC_ABORT
+#define TALLOC_ABORT(reason) abort()
+#endif
+
+/* cast away const from ptr and yield it as a "type *". When intptr_t is
+   known to exist the cast goes through an integer; otherwise a plain
+   pointer cast is used. */
+#ifndef discard_const_p
+#if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
+# define discard_const_p(type, ptr) ((type *)((intptr_t)(ptr)))
+#else
+# define discard_const_p(type, ptr) ((type *)(ptr))
+#endif
+#endif
+
+/* this null_context is only set by talloc_enable_null_tracking() (which
+   talloc_enable_leak_report() and talloc_enable_leak_report_full() call);
+   otherwise it remains NULL. When set, it is substituted wherever a
+   caller passes a NULL context.
+*/
+static const void *null_context;
+/* context handed out by talloc_autofree_context(), freed by an atexit hook */
+static void *cleanup_context;
+/* optional callback tried when malloc()/realloc() fails; a non-zero return
+   makes the allocator retry once. Installed by talloc_set_fail_handler() */
+static int (*malloc_fail_handler)(void *);
+/* opaque argument passed to malloc_fail_handler */
+static void *malloc_fail_data;
+
+/* payload of a reference chunk created by talloc_reference(): one node of
+   the doubly linked list that hangs off the referenced chunk's refs field */
+struct talloc_reference_handle {
+	/* links within the referenced chunk's refs list */
+	struct talloc_reference_handle *next, *prev;
+	/* the talloc pointer this handle refers to */
+	void *ptr;
+};
+
+typedef int (*talloc_destructor_t)(void *);
+
+/* header stored immediately in front of every talloc allocation; the
+   pointer handed to users is the address just past this header (tc+1) */
+struct talloc_chunk {
+	/* sibling links among the children of one parent */
+	struct talloc_chunk *next, *prev;
+	/* parent is only maintained on the first chunk of a sibling list
+	   (see talloc_parent_chunk()); child points at the first child */
+	struct talloc_chunk *parent, *child;
+	/* list of reference handles created by talloc_reference() */
+	struct talloc_reference_handle *refs;
+	/* number of user bytes, not counting this header */
+	size_t size;
+	/* TALLOC_MAGIC while live, TALLOC_MAGIC_FREE once released */
+	unsigned magic;
+	/* called by talloc_free(); temporarily set to (talloc_destructor_t)-1
+	   while the destructor is running */
+	talloc_destructor_t destructor;
+	/* chunk name, NULL, or the TALLOC_MAGIC_REFERENCE sentinel */
+	const char *name;
+};
+
+/* map a user pointer back to its chunk header, calling TALLOC_ABORT()
+   when the header does not carry the live magic value */
+static struct talloc_chunk *talloc_chunk_from_ptr(const void *ptr)
+{
+	struct talloc_chunk *hdr = discard_const_p(struct talloc_chunk, ptr) - 1;
+
+	if (hdr->magic == TALLOC_MAGIC) {
+		return hdr;
+	}
+
+	/* distinguish an already-freed chunk from arbitrary garbage */
+	if (hdr->magic == TALLOC_MAGIC_FREE) {
+		TALLOC_ABORT("Bad talloc magic value - double free");
+	} else {
+		TALLOC_ABORT("Bad talloc magic value - unknown value");
+	}
+
+	return hdr;
+}
+
+/* hook into the front of the list: p becomes the new head stored in
+   `list`, with p->prev cleared and p->next pointing at the old head (or
+   NULL when the list was empty). Both arguments are evaluated several
+   times, so they must be free of side effects. */
+#define _TLIST_ADD(list, p) \
+do { \
+        if (!(list)) { \
+		(list) = (p); \
+		(p)->next = (p)->prev = NULL; \
+	} else { \
+		(list)->prev = (p); \
+		(p)->next = (list); \
+		(p)->prev = NULL; \
+		(list) = (p); \
+	}\
+} while (0)
+
+/* remove an element from a list - element doesn't have to be in list.
+   If p is the head, the head advances to p->next; otherwise p's
+   neighbours are linked to each other. Afterwards p's own links are
+   cleared, unless p still equals the (new) head. Arguments are evaluated
+   several times, so they must be free of side effects. */
+#define _TLIST_REMOVE(list, p) \
+do { \
+	if ((p) == (list)) { \
+		(list) = (p)->next; \
+		if (list) (list)->prev = NULL; \
+	} else { \
+		if ((p)->prev) (p)->prev->next = (p)->next; \
+		if ((p)->next) (p)->next->prev = (p)->prev; \
+	} \
+	if ((p) && ((p) != (list))) (p)->next = (p)->prev = NULL; \
+} while (0)
+
+
+/*
+  return the chunk header of a pointer's parent (NULL if it has none).
+  Only the first sibling carries the parent link, so rewind to the head
+  of the sibling list before reading it.
+*/
+static struct talloc_chunk *talloc_parent_chunk(const void *ptr)
+{
+	struct talloc_chunk *head;
+
+	for (head = talloc_chunk_from_ptr(ptr); head->prev != NULL; head = head->prev) {
+		/* walk back to the first sibling */
+	}
+	return head->parent;
+}
+
+/*
+  return the parent context of a talloc pointer, or NULL when the
+  pointer has no parent chunk
+*/
+void *talloc_parent(const void *ptr)
+{
+	struct talloc_chunk *tc = talloc_parent_chunk(ptr);
+
+	/* without this check a parentless pointer would yield the bogus
+	   non-NULL address (NULL + sizeof(header)) */
+	if (tc == NULL) {
+		return NULL;
+	}
+	return (void *)(tc+1);
+}
+
+/*
+   Allocate `size` bytes as a child of `context`. A NULL context is
+   replaced by null_context (which may itself be NULL, giving a parentless
+   chunk). Returns NULL when size is MAX_TALLOC_SIZE or more, or when
+   memory cannot be obtained even after consulting the failure handler.
+*/
+void *_talloc(const void *context, size_t size)
+{
+	struct talloc_chunk *chunk;
+
+	if (context == NULL) {
+		context = null_context;
+	}
+
+	if (size >= MAX_TALLOC_SIZE) {
+		return NULL;
+	}
+
+	chunk = malloc(sizeof(*chunk)+size);
+	/* on failure give the registered handler one chance to free memory,
+	   then retry exactly once */
+	if (chunk == NULL && malloc_fail_handler &&
+	    malloc_fail_handler(malloc_fail_data)) {
+		chunk = malloc(sizeof(*chunk)+size);
+	}
+	if (chunk == NULL) {
+		return NULL;
+	}
+
+	chunk->magic = TALLOC_MAGIC;
+	chunk->size = size;
+	chunk->name = NULL;
+	chunk->destructor = NULL;
+	chunk->refs = NULL;
+	chunk->child = NULL;
+
+	if (context == NULL) {
+		chunk->parent = NULL;
+		chunk->next = NULL;
+		chunk->prev = NULL;
+		return (void *)(chunk+1);
+	}
+
+	{
+		struct talloc_chunk *owner = talloc_chunk_from_ptr(context);
+
+		chunk->parent = owner;
+
+		/* the parent link lives on the first sibling only; the old
+		   head is about to become the second entry */
+		if (owner->child) {
+			owner->child->parent = NULL;
+		}
+
+		_TLIST_ADD(owner->child, chunk);
+	}
+
+	return (void *)(chunk+1);
+}
+
+
+/*
+  register a destructor that talloc_free() runs before releasing ptr.
+  the destructor returns 0 to allow the free, or -1 to veto it, in which
+  case the memory stays valid and usable. Passing NULL removes any
+  destructor.
+*/
+void talloc_set_destructor(const void *ptr, int (*destructor)(void *))
+{
+	talloc_chunk_from_ptr(ptr)->destructor = destructor;
+}
+
+/*
+  take an additional reference on a piece of memory; the reference is
+  owned by null_context. The result of talloc_reference() is discarded.
+*/
+void talloc_increase_ref_count(const void *ptr)
+{
+	(void)talloc_reference(null_context, ptr);
+}
+
+/*
+  helper for talloc_reference(): detach a reference handle from the chunk
+  it refers to and free the handle. Installed as the handle's destructor,
+  and also called directly by talloc_free() on the first entry of a
+  chunk's refs list. Always returns 0.
+*/
+static int talloc_reference_destructor(void *ptr)
+{
+	struct talloc_reference_handle *handle = ptr;
+	/* tc1: header of the handle itself; tc2: header of the referenced memory */
+	struct talloc_chunk *tc1 = talloc_chunk_from_ptr(ptr);
+	struct talloc_chunk *tc2 = talloc_chunk_from_ptr(handle->ptr);
+	/* (talloc_destructor_t)-1 means talloc_free() is currently running
+	   this destructor; in that case leave the marker alone so the
+	   talloc_free(handle) below returns early. Otherwise clear the
+	   destructor so that talloc_free(handle) does not call back in here. */
+	if (tc1->destructor != (talloc_destructor_t)-1) {
+		tc1->destructor = NULL;
+	}
+	_TLIST_REMOVE(tc2->refs, handle);
+	talloc_free(handle);
+	return 0;
+}
+
+/*
+  create a secondary reference to ptr, owned by the given context.
+  ptr then stays valid until both its original parent and this context
+  have let go of it, which lets two structures share the same data and be
+  freed independently, in either order.
+
+  returns ptr on success, NULL if ptr is NULL or the handle cannot be
+  allocated.
+*/
+void *talloc_reference(const void *context, const void *ptr)
+{
+	struct talloc_reference_handle *ref;
+	struct talloc_chunk *target;
+
+	if (ptr == NULL) {
+		return NULL;
+	}
+
+	target = talloc_chunk_from_ptr(ptr);
+
+	ref = talloc_named_const(context, sizeof(*ref), TALLOC_MAGIC_REFERENCE);
+	if (ref == NULL) {
+		return NULL;
+	}
+
+	ref->ptr = discard_const_p(void, ptr);
+
+	/* the destructor lives on the handle rather than on ptr, so the
+	   caller remains free to install a destructor of their own on ptr */
+	talloc_set_destructor(ref, talloc_reference_destructor);
+
+	_TLIST_ADD(target->refs, ref);
+	return ref->ptr;
+}
+
+/*
+  remove a secondary reference to a pointer, undoing what
+  talloc_reference() has done. The context and pointer arguments must
+  match those given to talloc_reference(). Returns 0 when a matching
+  reference was found and removed, -1 otherwise.
+*/
+static int talloc_unreference(const void *context, const void *ptr)
+{
+	struct talloc_chunk *target = talloc_chunk_from_ptr(ptr);
+	struct talloc_reference_handle *ref;
+
+	if (context == NULL) {
+		context = null_context;
+	}
+
+	/* find the first handle whose owner is the requested context */
+	ref = target->refs;
+	while (ref != NULL) {
+		struct talloc_chunk *owner = talloc_parent_chunk(ref);
+		if ((owner==NULL && context==NULL) || owner+1 == context) {
+			break;
+		}
+		ref = ref->next;
+	}
+
+	if (ref == NULL) {
+		return -1;
+	}
+
+	/* drop the destructor first so freeing the handle does not unlink
+	   it a second time */
+	talloc_set_destructor(ref, NULL);
+	_TLIST_REMOVE(target->refs, ref);
+	talloc_free(ref);
+	return 0;
+}
+
+/*
+  remove a specific parent context from a pointer. This is a more
+  controlled variant of talloc_free().
+
+  returns 0 on success. Returns -1 if ptr is NULL, if context is neither
+  a reference owner nor the parent of ptr, or if a later step fails.
+*/
+int talloc_unlink(const void *context, void *ptr)
+{
+	struct talloc_chunk *tc_p, *new_p;
+	void *new_parent;
+
+	if (ptr == NULL) {
+		return -1;
+	}
+
+	if (context == NULL) {
+		context = null_context;
+	}
+
+	/* easy case: context merely holds a reference to ptr */
+	if (talloc_unreference(context, ptr) == 0) {
+		return 0;
+	}
+
+	/* otherwise context must be the actual parent of ptr */
+	if (context == NULL) {
+		if (talloc_parent_chunk(ptr) != NULL) {
+			return -1;
+		}
+	} else {
+		if (talloc_chunk_from_ptr(context) != talloc_parent_chunk(ptr)) {
+			return -1;
+		}
+	}
+	
+	tc_p = talloc_chunk_from_ptr(ptr);
+
+	/* no references left: unlinking the parent is a plain free */
+	if (tc_p->refs == NULL) {
+		return talloc_free(ptr);
+	}
+
+	/* promote the owner of the first remaining reference to be the
+	   new parent of ptr */
+	new_p = talloc_parent_chunk(tc_p->refs);
+	if (new_p) {
+		new_parent = new_p+1;
+	} else {
+		new_parent = NULL;
+	}
+
+	/* that owner's reference is replaced by real parenthood below */
+	if (talloc_unreference(new_parent, ptr) != 0) {
+		return -1;
+	}
+
+	talloc_steal(new_parent, ptr);
+
+	return 0;
+}
+
+/*
+  va_list flavour of talloc_set_name(): format a name, allocate it as a
+  child of ptr and attach it. If the allocation fails the chunk's name
+  ends up NULL.
+*/
+static void talloc_set_name_v(const void *ptr, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+
+static void talloc_set_name_v(const void *ptr, const char *fmt, va_list ap)
+{
+	struct talloc_chunk *tc = talloc_chunk_from_ptr(ptr);
+	char *formatted = talloc_vasprintf(ptr, fmt, ap);
+
+	tc->name = formatted;
+	if (formatted != NULL) {
+		/* label the name string itself */
+		talloc_set_name_const(formatted, ".name");
+	}
+}
+
+/*
+  give an existing pointer a printf-style formatted name; the name string
+  is allocated as a child of ptr
+*/
+void talloc_set_name(const void *ptr, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	talloc_set_name_v(ptr, fmt, args);
+	va_end(args);
+}
+
+/*
+   cheaper way to name a pointer: the name pointer is stored as-is, not
+   copied, so it must stay valid for as long as the chunk carries it
+   (typically a true string constant)
+*/
+void talloc_set_name_const(const void *ptr, const char *name)
+{
+	talloc_chunk_from_ptr(ptr)->name = name;
+}
+
+/*
+  allocate size bytes under context and give the new pointer a
+  printf-style formatted name. Returns NULL if the allocation fails.
+*/
+void *talloc_named(const void *context, size_t size, const char *fmt, ...)
+{
+	va_list args;
+	void *chunk = _talloc(context, size);
+
+	if (chunk != NULL) {
+		va_start(args, fmt);
+		talloc_set_name_v(chunk, fmt, args);
+		va_end(args);
+	}
+
+	return chunk;
+}
+
+/*
+  allocate size bytes under context and name the new pointer with a
+  constant string. Unlike talloc_named() the name is not copied; the
+  pointer passed in is stored directly. Returns NULL if the allocation
+  fails.
+*/
+void *talloc_named_const(const void *context, size_t size, const char *name)
+{
+	void *chunk = _talloc(context, size);
+
+	if (chunk != NULL) {
+		talloc_set_name_const(chunk, name);
+	}
+
+	return chunk;
+}
+
+/*
+  return the name of a talloc ptr: ".reference" for reference handles,
+  "UNNAMED" when no name has been set
+*/
+const char *talloc_get_name(const void *ptr)
+{
+	const char *label = talloc_chunk_from_ptr(ptr)->name;
+
+	/* the sentinel is not a real string and must never be returned */
+	if (label == TALLOC_MAGIC_REFERENCE) {
+		return ".reference";
+	}
+	return label ? label : "UNNAMED";
+}
+
+
+/*
+  return ptr (with const removed) if its talloc name matches `name`,
+  either by pointer identity or by string comparison; return NULL for a
+  NULL ptr or a mismatch
+*/
+void *talloc_check_name(const void *ptr, const char *name)
+{
+	const char *actual;
+
+	if (ptr == NULL) {
+		return NULL;
+	}
+
+	actual = talloc_get_name(ptr);
+	if (actual != name && strcmp(actual, name) != 0) {
+		return NULL;
+	}
+	return discard_const_p(void, ptr);
+}
+
+
+/*
+  compatibility entry point for older talloc users: create a zero-size
+  context requested with a NULL parent and give it a printf-style name.
+  Returns NULL if the allocation fails.
+*/
+void *talloc_init(const char *fmt, ...)
+{
+	va_list args;
+	void *ctx = _talloc(NULL, 0);
+
+	if (ctx != NULL) {
+		va_start(args, fmt);
+		talloc_set_name_v(ctx, fmt, args);
+		va_end(args);
+	}
+
+	return ctx;
+}
+
+/*
+  this is a replacement for the Samba3 talloc_destroy_pool functionality. It
+  should probably not be used in new code. It's in here to keep the talloc
+  code consistent across Samba 3 and 4.
+
+  frees every child of ptr; a child that refuses to be freed is moved to
+  another parent so that ptr ends up with no children. A NULL ptr is a
+  no-op.
+*/
+void talloc_free_children(void *ptr)
+{
+	struct talloc_chunk *tc;
+
+	if (ptr == NULL) {
+		return;
+	}
+
+	tc = talloc_chunk_from_ptr(ptr);
+
+	/* each pass removes the current first child, by free or by steal */
+	while (tc->child) {
+		/* we need to work out who will own an abandoned child
+		   if it cannot be freed. In priority order, the first
+		   choice is owner of any remaining reference to this
+		   pointer, the second choice is our parent, and the
+		   final choice is the null context. */
+		void *child = tc->child+1;
+		const void *new_parent = null_context;
+		/* must be looked up before talloc_free(), which consumes the
+		   first reference of a referenced chunk */
+		if (tc->child->refs) {
+			struct talloc_chunk *p = talloc_parent_chunk(tc->child->refs);
+			if (p) new_parent = p+1;
+		}
+		if (talloc_free(child) == -1) {
+			if (new_parent == null_context) {
+				struct talloc_chunk *p = talloc_parent_chunk(ptr);
+				if (p) new_parent = p+1;
+			}
+			talloc_steal(new_parent, child);
+		}
+	}
+}
+
+/* 
+   free a talloc pointer. This also frees all child pointers of this 
+   pointer recursively
+
+   return 0 if the memory is actually freed, otherwise -1. The memory
+   will not be freed if ptr is NULL, if it still has references (in which
+   case one reference is dropped instead), if its destructor is already
+   running, or if the destructor returns -1
+*/
+int talloc_free(void *ptr)
+{
+	struct talloc_chunk *tc;
+
+	if (ptr == NULL) {
+		return -1;
+	}
+
+	tc = talloc_chunk_from_ptr(ptr);
+
+	/* referenced memory: drop the first reference and keep the memory */
+	if (tc->refs) {
+		talloc_reference_destructor(tc->refs);
+		return -1;
+	}
+
+	if (tc->destructor) {
+		talloc_destructor_t d = tc->destructor;
+		/* -1 marks a destructor in progress: refuse the nested free */
+		if (d == (talloc_destructor_t)-1) {
+			return -1;
+		}
+		tc->destructor = (talloc_destructor_t)-1;
+		if (d(ptr) == -1) {
+			/* destructor vetoed the free; restore it */
+			tc->destructor = d;
+			return -1;
+		}
+		tc->destructor = NULL;
+	}
+
+	talloc_free_children(ptr);
+
+	/* unlink from the sibling list; a non-NULL parent means tc is the
+	   list head, so the parent link is handed on to the new head */
+	if (tc->parent) {
+		_TLIST_REMOVE(tc->parent->child, tc);
+		if (tc->parent->child) {
+			tc->parent->child->parent = tc->parent;
+		}
+	} else {
+		if (tc->prev) tc->prev->next = tc->next;
+		if (tc->next) tc->next->prev = tc->prev;
+	}
+
+	/* poison the header so later use is reported as a double free */
+	tc->magic = TALLOC_MAGIC_FREE;
+
+	free(tc);
+	return 0;
+}
+
+
+
+/*
+  A talloc version of realloc. The context argument is only used if
+  ptr is NULL.
+
+  size == 0 frees ptr and returns NULL. Returns NULL without touching ptr
+  when size is MAX_TALLOC_SIZE or more, when ptr has references, or when
+  memory cannot be obtained. On success the (possibly moved) pointer is
+  returned and renamed to `name`.
+*/
+void *_talloc_realloc(const void *context, void *ptr, size_t size, const char *name)
+{
+	struct talloc_chunk *tc;
+	void *new_ptr;
+
+	/* size zero is equivalent to free() */
+	if (size == 0) {
+		talloc_free(ptr);
+		return NULL;
+	}
+
+	if (size >= MAX_TALLOC_SIZE) {
+		return NULL;
+	}
+
+	/* realloc(NULL) is equivalent to malloc() */
+	if (ptr == NULL) {
+		return talloc_named_const(context, size, name);
+	}
+
+	tc = talloc_chunk_from_ptr(ptr);
+
+	/* don't allow realloc on referenced pointers */
+	if (tc->refs) {
+		return NULL;
+	}
+
+	/* by resetting magic we catch users of the old memory */
+	tc->magic = TALLOC_MAGIC_FREE;
+
+#if ALWAYS_REALLOC
+	new_ptr = malloc(size + sizeof(*tc));
+	if (!new_ptr) {
+		/* the handler may touch this chunk, so make it valid again */
+		tc->magic = TALLOC_MAGIC;
+		if (malloc_fail_handler)
+			if (malloc_fail_handler(malloc_fail_data))
+				new_ptr = malloc(size + sizeof(*tc));
+	}
+	if (new_ptr) {
+		/* copy no more than fits in the new block: when shrinking,
+		   copying the full old size would overrun the new buffer */
+		size_t keep = (tc->size < size) ? tc->size : size;
+		memcpy(new_ptr, tc, keep + sizeof(*tc));
+		free(tc);
+	}
+#else
+	new_ptr = realloc(tc, size + sizeof(*tc));
+	if (!new_ptr) {
+		/* the handler may touch this chunk, so make it valid again */
+		tc->magic = TALLOC_MAGIC;
+		if (malloc_fail_handler)
+			if (malloc_fail_handler(malloc_fail_data))
+				new_ptr = realloc(tc, size + sizeof(*tc));
+	}
+#endif
+	if (!new_ptr) {
+		tc->magic = TALLOC_MAGIC;
+		return NULL;
+	}
+
+	/* the chunk may have moved: re-point every neighbour at it */
+	tc = new_ptr;
+	tc->magic = TALLOC_MAGIC;
+	if (tc->parent) {
+		/* a non-NULL parent means this chunk heads the child list */
+		tc->parent->child = new_ptr;
+	}
+	if (tc->child) {
+		tc->child->parent = new_ptr;
+	}
+
+	if (tc->prev) {
+		tc->prev->next = tc;
+	}
+	if (tc->next) {
+		tc->next->prev = tc;
+	}
+
+	tc->size = size;
+	talloc_set_name_const(tc+1, name);
+
+	return (void *)(tc+1);
+}
+
+/* 
+   move a lump of memory from one talloc context to another and return
+   ptr. A NULL new_ctx is replaced by null_context; if that is also NULL
+   the chunk is simply detached and left without a parent.
+   passing NULL as ptr will always return NULL with no side effects.
+*/
+void *talloc_steal(const void *new_ctx, const void *ptr)
+{
+	struct talloc_chunk *tc, *new_tc;
+
+	if (!ptr) {
+		return NULL;
+	}
+
+	if (new_ctx == NULL) {
+		new_ctx = null_context;
+	}
+
+	tc = talloc_chunk_from_ptr(ptr);
+
+	/* no destination at all: unlink from the current siblings only */
+	if (new_ctx == NULL) {
+		if (tc->parent) {
+			/* tc heads its sibling list; pass the parent link on */
+			_TLIST_REMOVE(tc->parent->child, tc);
+			if (tc->parent->child) {
+				tc->parent->child->parent = tc->parent;
+			}
+		} else {
+			if (tc->prev) tc->prev->next = tc->next;
+			if (tc->next) tc->next->prev = tc->prev;
+		}
+		
+		tc->parent = tc->next = tc->prev = NULL;
+		return discard_const_p(void, ptr);
+	}
+
+	new_tc = talloc_chunk_from_ptr(new_ctx);
+
+	/* stealing a pointer onto itself is a no-op */
+	if (tc == new_tc) {
+		return discard_const_p(void, ptr);
+	}
+
+	/* unlink from the old sibling list (same logic as above) */
+	if (tc->parent) {
+		_TLIST_REMOVE(tc->parent->child, tc);
+		if (tc->parent->child) {
+			tc->parent->child->parent = tc->parent;
+		}
+	} else {
+		if (tc->prev) tc->prev->next = tc->next;
+		if (tc->next) tc->next->prev = tc->prev;
+	}
+
+	/* insert at the head of the new parent's child list; the old head
+	   gives up its parent link */
+	tc->parent = new_tc;
+	if (new_tc->child) new_tc->child->parent = NULL;
+	_TLIST_ADD(new_tc->child, tc);
+
+	return discard_const_p(void, ptr);
+}
+
+/*
+  return the number of user bytes held by ptr and everything below it.
+  A NULL ptr means null_context; if that is NULL too the result is 0.
+*/
+off_t talloc_total_size(const void *ptr)
+{
+	struct talloc_chunk *root, *kid;
+	off_t sum;
+
+	if (ptr == NULL) {
+		ptr = null_context;
+		if (ptr == NULL) {
+			return 0;
+		}
+	}
+
+	root = talloc_chunk_from_ptr(ptr);
+
+	sum = root->size;
+	kid = root->child;
+	while (kid != NULL) {
+		sum += talloc_total_size(kid+1);
+		kid = kid->next;
+	}
+	return sum;
+}
+
+/*
+  return the number of chunks in the subtree rooted at ptr, counting ptr
+  itself. A NULL ptr means null_context; if that is NULL too the result
+  is 0.
+*/
+off_t talloc_total_blocks(const void *ptr)
+{
+	struct talloc_chunk *root, *kid;
+	off_t count;
+
+	if (ptr == NULL) {
+		ptr = null_context;
+		if (ptr == NULL) {
+			return 0;
+		}
+	}
+
+	root = talloc_chunk_from_ptr(ptr);
+
+	/* one for the root, plus each child's subtree */
+	count = 1;
+	kid = root->child;
+	while (kid != NULL) {
+		count += talloc_total_blocks(kid+1);
+		kid = kid->next;
+	}
+	return count;
+}
+
+/*
+  count the reference handles currently attached to a pointer
+*/
+static int talloc_reference_count(const void *ptr)
+{
+	struct talloc_reference_handle *ref = talloc_chunk_from_ptr(ptr)->refs;
+	int count = 0;
+
+	while (ref != NULL) {
+		count++;
+		ref = ref->next;
+	}
+	return count;
+}
+
+/*
+  print one line per descendant of ptr to f, indenting four columns per
+  nesting level starting at `depth`. Reference handles are printed as
+  "reference to: <name>" and are not descended into.
+*/
+void talloc_report_depth(const void *ptr, FILE *f, int depth)
+{
+	struct talloc_chunk *parent = talloc_chunk_from_ptr(ptr);
+	struct talloc_chunk *kid;
+
+	for (kid = parent->child; kid != NULL; kid = kid->next) {
+		void *kid_ptr = kid+1;
+
+		if (kid->name == TALLOC_MAGIC_REFERENCE) {
+			struct talloc_reference_handle *ref = kid_ptr;
+			const char *target = talloc_get_name(ref->ptr);
+			fprintf(f, "%*sreference to: %s\n", depth*4, "", target);
+			continue;
+		}
+
+		fprintf(f, "%*s%-30s contains %6lu bytes in %3lu blocks (ref %d)\n", 
+			depth*4, "",
+			talloc_get_name(kid_ptr),
+			(unsigned long)talloc_total_size(kid_ptr),
+			(unsigned long)talloc_total_blocks(kid_ptr),
+			talloc_reference_count(kid_ptr));
+		talloc_report_depth(kid_ptr, f, depth+1);
+	}
+}
+
+/*
+  write a summary line for ptr followed by a full tree view of all of its
+  descendants to f, then flush f. A NULL ptr means null_context; nothing
+  is written if that is NULL too.
+*/
+void talloc_report_full(const void *ptr, FILE *f)
+{
+	const void *root = ptr;
+
+	if (root == NULL) {
+		root = null_context;
+		if (root == NULL) {
+			return;
+		}
+	}
+
+	fprintf(f,"full talloc report on '%s' (total %lu bytes in %lu blocks)\n", 
+		talloc_get_name(root), 
+		(unsigned long)talloc_total_size(root),
+		(unsigned long)talloc_total_blocks(root));
+
+	talloc_report_depth(root, f, 1);
+	fflush(f);
+}
+
+/*
+  write a summary line for ptr plus one line per direct child to f, then
+  flush f. A NULL ptr means null_context; nothing is written if that is
+  NULL too.
+*/
+void talloc_report(const void *ptr, FILE *f)
+{
+	struct talloc_chunk *kid;
+
+	if (ptr == NULL) {
+		ptr = null_context;
+		if (ptr == NULL) {
+			return;
+		}
+	}
+
+	fprintf(f,"talloc report on '%s' (total %lu bytes in %lu blocks)\n", 
+		talloc_get_name(ptr), 
+		(unsigned long)talloc_total_size(ptr),
+		(unsigned long)talloc_total_blocks(ptr));
+
+	kid = talloc_chunk_from_ptr(ptr)->child;
+	while (kid != NULL) {
+		fprintf(f, "\t%-30s contains %6lu bytes in %3lu blocks\n", 
+			talloc_get_name(kid+1),
+			(unsigned long)talloc_total_size(kid+1),
+			(unsigned long)talloc_total_blocks(kid+1));
+		kid = kid->next;
+	}
+	fflush(f);
+}
+
+/*
+  atexit hook: print a brief report to stderr if anything is still
+  hanging off the null context
+*/
+static void talloc_report_null(void)
+{
+	if (talloc_total_size(null_context) == 0) {
+		return;
+	}
+	talloc_report(null_context, stderr);
+}
+
+/*
+  atexit hook: print a full tree report to stderr if anything is still
+  hanging off the null context
+*/
+static void talloc_report_null_full(void)
+{
+	if (talloc_total_size(null_context) == 0) {
+		return;
+	}
+	talloc_report_full(null_context, stderr);
+}
+
+/*
+  enable tracking of the NULL context: create the "null_context" chunk on
+  first use; later calls are no-ops while it exists
+*/
+void talloc_enable_null_tracking(void)
+{
+	if (null_context != NULL) {
+		return;
+	}
+	null_context = talloc_named_const(NULL, 0, "null_context");
+}
+
+/*
+  turn on NULL-context tracking and register an exit hook that prints a
+  brief leak report
+*/
+void talloc_enable_leak_report(void)
+{
+	talloc_enable_null_tracking();
+	(void)atexit(talloc_report_null);
+}
+
+/*
+  turn on NULL-context tracking and register an exit hook that prints a
+  full tree leak report
+*/
+void talloc_enable_leak_report_full(void)
+{
+	talloc_enable_null_tracking();
+	(void)atexit(talloc_report_null_full);
+}
+
+/* 
+   allocate size bytes named `name` under ctx and clear them to zero.
+   Returns NULL if the allocation fails.
+*/
+void *_talloc_zero(const void *ctx, size_t size, const char *name)
+{
+	void *mem = talloc_named_const(ctx, size, name);
+
+	if (mem == NULL) {
+		return NULL;
+	}
+
+	memset(mem, '\0', size);
+	return mem;
+}
+
+
+/*
+  allocate size bytes named `name` under t and fill them with a copy of
+  the size bytes at p. Returns NULL if the allocation fails.
+*/
+void *_talloc_memdup(const void *t, const void *p, size_t size, const char *name)
+{
+	void *copy = talloc_named_const(t, size, name);
+
+	if (copy == NULL) {
+		return NULL;
+	}
+
+	memcpy(copy, p, size);
+	return copy;
+}
+
+/*
+  duplicate the NUL-terminated string p under context t. The copy is
+  named after its own contents. Returns NULL if p is NULL or the
+  allocation fails.
+*/
+char *talloc_strdup(const void *t, const char *p)
+{
+	char *dup;
+
+	if (p == NULL) {
+		return NULL;
+	}
+
+	dup = talloc_memdup(t, p, strlen(p) + 1);
+	if (dup == NULL) {
+		return NULL;
+	}
+
+	talloc_set_name_const(dup, dup);
+	return dup;
+}
+
+/*
+  strndup with a talloc: copy at most n characters of p into a new,
+  always NUL-terminated string under context t. The copy is named after
+  its own contents. Returns NULL if p is NULL or the allocation fails.
+*/
+char *talloc_strndup(const void *t, const char *p, size_t n)
+{
+	size_t len;
+	char *ret;
+
+	/* match talloc_strdup(): a NULL source yields NULL, not a crash */
+	if (!p) {
+		return NULL;
+	}
+
+	/* test the bound before the byte, so p[n] is never read when the
+	   source has no terminator within its first n bytes */
+	for (len=0; len<n && p[len]; len++) ;
+
+	ret = _talloc(t, len + 1);
+	if (!ret) { return NULL; }
+	memcpy(ret, p, len);
+	ret[len] = 0;
+	talloc_set_name_const(ret, ret);
+	return ret;
+}
+
+/* VA_COPY(dest, src): duplicate a va_list so the arguments can be walked
+   more than once. Uses va_copy or __va_copy when the corresponding HAVE_
+   macro is defined, and falls back to plain assignment otherwise. */
+#ifndef VA_COPY
+#ifdef HAVE_VA_COPY
+#define VA_COPY(dest, src) va_copy(dest, src)
+#elif defined(HAVE___VA_COPY)
+#define VA_COPY(dest, src) __va_copy(dest, src)
+#else
+#define VA_COPY(dest, src) (dest) = (src)
+#endif
+#endif
+
+/*
+  vasprintf with a talloc: format fmt/ap into a newly allocated string
+  under context t. The string is named after its own contents. Returns
+  NULL if formatting fails or the allocation fails. ap itself is not
+  consumed; the function works on private copies.
+*/
+char *talloc_vasprintf(const void *t, const char *fmt, va_list ap)
+{	
+	int len;
+	char *ret;
+	va_list ap2;
+	
+	/* first pass: measure the formatted length */
+	VA_COPY(ap2, ap);
+	len = vsnprintf(NULL, 0, fmt, ap2);
+	/* every va_copy must be paired with a va_end */
+	va_end(ap2);
+
+	/* a negative length signals a formatting error; carrying on would
+	   allocate zero bytes and hand back an unterminated string */
+	if (len < 0) {
+		return NULL;
+	}
+
+	ret = _talloc(t, len+1);
+	if (ret) {
+		/* second pass: format for real into the new buffer */
+		VA_COPY(ap2, ap);
+		vsnprintf(ret, len+1, fmt, ap2);
+		va_end(ap2);
+		talloc_set_name_const(ret, ret);
+	}
+
+	return ret;
+}
+
+
+/*
+  printf-style formatting into a freshly allocated string owned by
+  context t. Returns whatever talloc_vasprintf() returns.
+ */
+char *talloc_asprintf(const void *t, const char *fmt, ...)
+{
+	va_list args;
+	char *str;
+
+	va_start(args, fmt);
+	str = talloc_vasprintf(t, fmt, args);
+	va_end(args);
+
+	return str;
+}
+
+
+/**
+ * Realloc @p s to append the formatted result of @p fmt and @p ap,
+ * and return @p s, which may have moved.  Good for gradually
+ * accumulating output into a string buffer.
+ *
+ * The existing string length is taken from the chunk size (size - 1),
+ * so @p s must be a talloc string allocated at exactly its length plus
+ * the terminator.  A NULL @p s behaves like talloc_vasprintf(NULL, ...).
+ * If formatting fails, @p s is returned unchanged.  Returns NULL if the
+ * reallocation fails.
+ **/
+
+static char *talloc_vasprintf_append(char *s, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+
+static char *talloc_vasprintf_append(char *s, const char *fmt, va_list ap)
+{	
+	struct talloc_chunk *tc;
+	int len, s_len;
+	va_list ap2;
+
+	if (s == NULL) {
+		return talloc_vasprintf(NULL, fmt, ap);
+	}
+
+	tc = talloc_chunk_from_ptr(s);
+
+	/* first pass: measure what will be appended */
+	VA_COPY(ap2, ap);
+	len = vsnprintf(NULL, 0, fmt, ap2);
+	/* every va_copy must be paired with a va_end */
+	va_end(ap2);
+
+	/* on a formatting error leave the string alone: shrinking the
+	   buffer by one would cut off its NUL terminator */
+	if (len < 0) {
+		return s;
+	}
+
+	/* a zero-size chunk holds no string at all; treat it as empty
+	   instead of computing a length of -1 */
+	s_len = tc->size ? (int)(tc->size - 1) : 0;
+
+	s = talloc_realloc(NULL, s, char, s_len + len+1);
+	if (!s) return NULL;
+
+	/* second pass: format directly behind the existing text */
+	VA_COPY(ap2, ap);
+	vsnprintf(s+s_len, len+1, fmt, ap2);
+	va_end(ap2);
+	talloc_set_name_const(s, s);
+
+	return s;
+}
+
+/*
+  printf-style append: grow the talloc string @p s by the formatted
+  result of @p fmt and return the (possibly moved) string.  Handy for
+  building up output piece by piece.
+ */
+char *talloc_asprintf_append(char *s, const char *fmt, ...)
+{
+	va_list args;
+	char *grown;
+
+	va_start(args, fmt);
+	grown = talloc_vasprintf_append(s, fmt, args);
+	va_end(args);
+
+	return grown;
+}
+
+/*
+  alloc an array of count elements of el_size bytes each, checking for
+  integer overflow in the array size. Returns NULL if the total would
+  reach MAX_TALLOC_SIZE or the allocation fails.
+*/
+void *_talloc_array(const void *ctx, size_t el_size, unsigned count, const char *name)
+{
+	/* el_size == 0 cannot overflow and must not be used as a divisor */
+	if (el_size != 0 && count >= MAX_TALLOC_SIZE/el_size) {
+		return NULL;
+	}
+	return talloc_named_const(ctx, el_size * count, name);
+}
+
+/*
+  alloc a zero-filled array of count elements of el_size bytes each,
+  checking for integer overflow in the array size. Returns NULL if the
+  total would reach MAX_TALLOC_SIZE or the allocation fails.
+*/
+void *_talloc_zero_array(const void *ctx, size_t el_size, unsigned count, const char *name)
+{
+	/* el_size == 0 cannot overflow and must not be used as a divisor */
+	if (el_size != 0 && count >= MAX_TALLOC_SIZE/el_size) {
+		return NULL;
+	}
+	return _talloc_zero(ctx, el_size * count, name);
+}
+
+
+/*
+  realloc an array to count elements of el_size bytes each, checking for
+  integer overflow in the array size. Returns NULL if the total would
+  reach MAX_TALLOC_SIZE; otherwise behaves like _talloc_realloc() on the
+  computed byte size (so a total of zero frees ptr).
+*/
+void *_talloc_realloc_array(const void *ctx, void *ptr, size_t el_size, unsigned count, const char *name)
+{
+	/* el_size == 0 cannot overflow and must not be used as a divisor */
+	if (el_size != 0 && count >= MAX_TALLOC_SIZE/el_size) {
+		return NULL;
+	}
+	return _talloc_realloc(ctx, ptr, el_size * count, name);
+}
+
+/*
+  plain-function form of talloc realloc, suitable for handing to
+  libraries that accept a realloc-style callback (a realloc function
+  covers allocate, resize and free in one entry point). The resulting
+  chunk is given no name.
+*/
+void *talloc_realloc_fn(const void *context, void *ptr, size_t size)
+{
+	const char *no_name = NULL;
+
+	return _talloc_realloc(context, ptr, size, no_name);
+}
+
+
+/* atexit hook: release the autofree context and forget it */
+static void talloc_autofree(void)
+{
+	void *ctx = cleanup_context;
+
+	talloc_free(ctx);
+	cleanup_context = NULL;
+}
+
+/*
+  return a context that is freed automatically at process exit, creating
+  it (and registering the exit hook) on first use. Hanging long-lived
+  allocations off it keeps them out of leak reports.
+*/
+void *talloc_autofree_context(void)
+{
+	if (cleanup_context != NULL) {
+		return cleanup_context;
+	}
+
+	cleanup_context = talloc_named_const(NULL, 0, "autofree_context");
+	atexit(talloc_autofree);
+	return cleanup_context;
+}
+
+/* return the user-visible size of a talloc pointer, or 0 for NULL */
+size_t talloc_get_size(const void *context)
+{
+	if (context == NULL) {
+		return 0;
+	}
+	return talloc_chunk_from_ptr(context)->size;
+}
+
+/* install the callback (and its argument) that is tried when malloc or
+   realloc fails; returns the previously installed callback */
+talloc_fail_handler *talloc_set_fail_handler(talloc_fail_handler *handler,
+					     void *data)
+{
+	talloc_fail_handler *previous;
+
+	previous = malloc_fail_handler;
+	malloc_fail_data = data;
+	malloc_fail_handler = handler;
+
+	return previous;
+}
diff --git a/tools/xenstore/talloc.h b/tools/xenstore/talloc.h
new file mode 100644
index 0000000000..39bcb53fb7
--- /dev/null
+++ b/tools/xenstore/talloc.h
@@ -0,0 +1,134 @@
+#ifndef _TALLOC_H_
+#define _TALLOC_H_
+/* 
+   Unix SMB/CIFS implementation.
+   Samba temporary memory allocation functions
+
+   Copyright (C) Andrew Tridgell 2004-2005
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* this is only needed for compatibility with the old talloc */
+typedef void TALLOC_CTX;
+
+/*
+  this uses a little trick to allow __LINE__ to be stringified
+*/
+#define _STRING_LINE_(s)    #s
+#define _STRING_LINE2_(s)   _STRING_LINE_(s)
+#define __LINESTR__       _STRING_LINE2_(__LINE__)
+#define __location__ __FILE__ ":" __LINESTR__
+
+#ifndef TALLOC_DEPRECATED
+#define TALLOC_DEPRECATED 0
+#endif
+
+/* useful macros for creating type checked pointers. The typed variants
+   name the chunk after the stringified type; the _size variants name it
+   after the file:line of the call site (__location__). */
+#define talloc(ctx, type) (type *)talloc_named_const(ctx, sizeof(type), #type)
+#define talloc_size(ctx, size) talloc_named_const(ctx, size, __location__)
+
+/* zero-sized chunk intended purely as a parent for other allocations */
+#define talloc_new(ctx) talloc_named_const(ctx, 0, "talloc_new: " __location__)
+
+/* as above, but the memory is cleared to zero */
+#define talloc_zero(ctx, type) (type *)_talloc_zero(ctx, sizeof(type), #type)
+#define talloc_zero_size(ctx, size) _talloc_zero(ctx, size, __location__)
+
+/* array variants; the element count is checked against the size limit */
+#define talloc_zero_array(ctx, type, count) (type *)_talloc_zero_array(ctx, sizeof(type), count, #type)
+#define talloc_array(ctx, type, count) (type *)_talloc_array(ctx, sizeof(type), count, #type)
+#define talloc_array_size(ctx, size, count) _talloc_array(ctx, size, count, __location__)
+
+/* resize to `count` elements of `type` / to `size` bytes */
+#define talloc_realloc(ctx, p, type, count) (type *)_talloc_realloc_array(ctx, p, sizeof(type), count, #type)
+#define talloc_realloc_size(ctx, ptr, size) _talloc_realloc(ctx, ptr, size, __location__)
+
+#define talloc_memdup(t, p, size) _talloc_memdup(t, p, size, __location__)
+
+/* NOTE(review): realloc_array() is not declared in this header or defined
+   in talloc.c; the two array macros below presumably rely on a helper
+   from the original Samba tree - confirm before using them */
+#define malloc_p(type) (type *)malloc(sizeof(type))
+#define malloc_array_p(type, count) (type *)realloc_array(NULL, sizeof(type), count)
+#define realloc_p(p, type, count) (type *)realloc_array(p, sizeof(type), count)
+
+/* NOTE(review): data_blob_named() and data_blob_talloc_named() are not
+   declared here either; presumably Samba leftovers - confirm */
+#define data_blob(ptr, size) data_blob_named(ptr, size, "DATA_BLOB: "__location__)
+#define data_blob_talloc(ctx, ptr, size) data_blob_talloc_named(ctx, ptr, size, "DATA_BLOB: "__location__)
+#define data_blob_dup_talloc(ctx, blob) data_blob_talloc_named(ctx, (blob)->data, (blob)->length, "DATA_BLOB: "__location__)
+
+/* tag a pointer with a type name / fetch it back only if the tag matches
+   (talloc_get_type yields NULL on a mismatch) */
+#define talloc_set_type(ptr, type) talloc_set_name_const(ptr, #type)
+#define talloc_get_type(ptr, type) (type *)talloc_check_name(ptr, #type)
+
+
+#if TALLOC_DEPRECATED
+#define talloc_zero_p(ctx, type) talloc_zero(ctx, type)
+#define talloc_p(ctx, type) talloc(ctx, type)
+#define talloc_array_p(ctx, type, count) talloc_array(ctx, type, count)
+#define talloc_realloc_p(ctx, p, type, count) talloc_realloc(ctx, p, type, count)
+#define talloc_destroy(ctx) talloc_free(ctx)
+#endif
+
+#ifndef PRINTF_ATTRIBUTE
+#if (__GNUC__ >= 3)
+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
+ * the parameter containing the format, and a2 the index of the first
+ * argument. Note that some gcc 2.x versions don't handle this
+ * properly **/
+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
+#else
+#define PRINTF_ATTRIBUTE(a1, a2)
+#endif
+#endif
+
+
+/* The following definitions come from talloc.c  */
+void *_talloc(const void *context, size_t size);
+void talloc_set_destructor(const void *ptr, int (*destructor)(void *));
+void talloc_increase_ref_count(const void *ptr);
+void *talloc_reference(const void *context, const void *ptr);
+int talloc_unlink(const void *context, void *ptr);
+void talloc_set_name(const void *ptr, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+void talloc_set_name_const(const void *ptr, const char *name);
+void *talloc_named(const void *context, size_t size, 
+		   const char *fmt, ...) PRINTF_ATTRIBUTE(3,4);
+void *talloc_named_const(const void *context, size_t size, const char *name);
+const char *talloc_get_name(const void *ptr);
+void *talloc_check_name(const void *ptr, const char *name);
+void talloc_report_depth(const void *ptr, FILE *f, int depth);
+void *talloc_parent(const void *ptr);
+void *talloc_init(const char *fmt, ...) PRINTF_ATTRIBUTE(1,2);
+int talloc_free(void *ptr);
+void *_talloc_realloc(const void *context, void *ptr, size_t size, const char *name);
+void *talloc_steal(const void *new_ctx, const void *ptr);
+off_t talloc_total_size(const void *ptr);
+off_t talloc_total_blocks(const void *ptr);
+void talloc_report_full(const void *ptr, FILE *f);
+void talloc_report(const void *ptr, FILE *f);
+void talloc_enable_null_tracking(void);
+void talloc_enable_leak_report(void);
+void talloc_enable_leak_report_full(void);
+void *_talloc_zero(const void *ctx, size_t size, const char *name);
+void *_talloc_memdup(const void *t, const void *p, size_t size, const char *name);
+char *talloc_strdup(const void *t, const char *p);
+char *talloc_strndup(const void *t, const char *p, size_t n);
+char *talloc_vasprintf(const void *t, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+char *talloc_asprintf(const void *t, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+char *talloc_asprintf_append(char *s,
+			     const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+void *_talloc_array(const void *ctx, size_t el_size, unsigned count, const char *name);
+void *_talloc_zero_array(const void *ctx, size_t el_size, unsigned count, const char *name);
+void *_talloc_realloc_array(const void *ctx, void *ptr, size_t el_size, unsigned count, const char *name);
+void *talloc_realloc_fn(const void *context, void *ptr, size_t size);
+void *talloc_autofree_context(void);
+size_t talloc_get_size(const void *ctx);
+
+typedef int talloc_fail_handler(void *);
+talloc_fail_handler *talloc_set_fail_handler(talloc_fail_handler *, void *);
+#endif
+
diff --git a/tools/xenstore/talloc_guide.txt b/tools/xenstore/talloc_guide.txt
new file mode 100644
index 0000000000..c23ac77cad
--- /dev/null
+++ b/tools/xenstore/talloc_guide.txt
@@ -0,0 +1,569 @@
+Using talloc in Samba4
+----------------------
+
+Andrew Tridgell
+September 2004
+
+The most current version of this document is available at
+   http://samba.org/ftp/unpacked/samba4/source/lib/talloc/talloc_guide.txt
+
+If you are used to talloc from Samba3 then please read this carefully,
+as talloc has changed a lot.
+
+The new talloc is a hierarchical, reference counted memory pool system
+with destructors. Quite a mouthful really, but not too bad once you
+get used to it.
+
+Perhaps the biggest change from Samba3 is that there is no distinction
+between a "talloc context" and a "talloc pointer". Any pointer
+returned from talloc() is itself a valid talloc context. This means
+you can do this:
+
+  struct foo *X = talloc(mem_ctx, struct foo);
+  X->name = talloc_strdup(X, "foo");
+
+and the pointer X->name would be a "child" of the talloc context "X"
+which is itself a child of mem_ctx. So if you do talloc_free(mem_ctx)
+then it is all destroyed, whereas if you do talloc_free(X) then just X
+and X->name are destroyed, and if you do talloc_free(X->name) then
+just the name element of X is destroyed.
+
+If you think about this, then what this effectively gives you is an
+n-ary tree, where you can free any part of the tree with
+talloc_free().
+
+If you find this confusing, then I suggest you run the testsuite to
+watch talloc in action. You may also like to add your own tests to
+testsuite.c to clarify how some particular situation is handled.
+
+
+Performance
+-----------
+
+All the additional features of talloc() over malloc() do come at a
+price. We have a simple performance test in Samba4 that measures
+talloc() versus malloc() performance, and it seems that talloc() is
+about 10% slower than malloc() on my x86 Debian Linux box. For Samba,
+the great reduction in code complexity that we get by using talloc
+makes this worthwhile, especially as the total overhead of
+talloc/malloc in Samba is already quite small.
+
+
+talloc API
+----------
+
+The following is a complete guide to the talloc API. Read it all at
+least twice.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc(const void *context, type);
+
+The talloc() macro is the core of the talloc library. It takes a
+memory context and a type, and returns a pointer to a new area of
+memory of the given type.
+
+The returned pointer is itself a talloc context, so you can use it as
+the context argument to more calls to talloc if you wish.
+
+The returned pointer is a "child" of the supplied context. This means
+that if you talloc_free() the context then the new child disappears as
+well. Alternatively you can free just the child.
+
+The context argument to talloc() can be NULL, in which case a new top
+level context is created. 
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_size(const void *context, size_t size);
+
+The function talloc_size() should be used when you don't have a
+convenient type to pass to talloc(). Unlike talloc(), it is not type
+safe (as it returns a void *), so you are on your own for type checking.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+int talloc_free(void *ptr);
+
+The talloc_free() function frees a piece of talloc memory, and all its
+children. You can call talloc_free() on any pointer returned by
+talloc().
+
+The return value of talloc_free() indicates success or failure, with 0
+returned for success and -1 for failure. The only possible failure
+condition is if the pointer had a destructor attached to it and the
+destructor returned -1. See talloc_set_destructor() for details on
+destructors.
+
+If this pointer has an additional parent when talloc_free() is called
+then the memory is not actually released, but instead the most
+recently established parent is destroyed. See talloc_reference() for
+details on establishing additional parents.
+
+For more control on which parent is removed, see talloc_unlink()
+
+talloc_free() operates recursively on its children.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+int talloc_free_children(void *ptr);
+
+The talloc_free_children() function walks along the list of all
+children of a talloc context and talloc_free()s only the children,
+not the context itself.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_reference(const void *context, const void *ptr);
+
+The talloc_reference() function makes "context" an additional parent
+of "ptr".
+
+The return value of talloc_reference() is always the original pointer
+"ptr", unless talloc ran out of memory in creating the reference in
+which case it will return NULL (each additional reference consumes
+around 48 bytes of memory on intel x86 platforms).
+
+If "ptr" is NULL, then the function is a no-op, and simply returns NULL.
+
+After creating a reference you can free it in one of the following
+ways:
+
+  - you can talloc_free() any parent of the original pointer. That
+    will reduce the number of parents of this pointer by 1, and will
+    cause this pointer to be freed if it runs out of parents.
+
+  - you can talloc_free() the pointer itself. That will destroy the
+    most recently established parent to the pointer and leave the
+    pointer as a child of its current parent.
+
+For more control on which parent to remove, see talloc_unlink()
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+int talloc_unlink(const void *context, const void *ptr);
+
+The talloc_unlink() function removes a specific parent from ptr. The
+context passed must either be a context used in talloc_reference()
+with this pointer, or must be a direct parent of ptr. 
+
+Note that if the parent has already been removed using talloc_free()
+then this function will fail and will return -1.  Likewise, if "ptr"
+is NULL, then the function will make no modifications and return -1.
+
+Usually you can just use talloc_free() instead of talloc_unlink(), but
+sometimes it is useful to have the additional control on which parent
+is removed.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_destructor(const void *ptr, int (*destructor)(void *));
+
+The function talloc_set_destructor() sets the "destructor" for the
+pointer "ptr". A destructor is a function that is called when the
+memory used by a pointer is about to be released. The destructor
+receives the pointer as an argument, and should return 0 for success
+and -1 for failure.
+
+The destructor can do anything it wants to, including freeing other
+pieces of memory. A common use for destructors is to clean up
+operating system resources (such as open file descriptors) contained
+in the structure the destructor is placed on.
+
+You can only place one destructor on a pointer. If you need more than
+one destructor then you can create a zero-length child of the pointer
+and place an additional destructor on that.
+
+To remove a destructor call talloc_set_destructor() with NULL for the
+destructor.
+
+If your destructor attempts to talloc_free() the pointer that it is
+the destructor for then talloc_free() will return -1 and the free will
+be ignored. This would be a pointless operation anyway, as the
+destructor is only called when the memory is just about to go away.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_increase_ref_count(const void *ptr);
+
+The talloc_increase_ref_count(ptr) function is exactly equivalent to:
+
+  talloc_reference(NULL, ptr);
+
+You can use either syntax, depending on which you think is clearer in
+your code.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_name(const void *ptr, const char *fmt, ...);
+
+Each talloc pointer has a "name". The name is used principally for
+debugging purposes, although it is also possible to set and get the
+name on a pointer as a way of "marking" pointers in your code.
+
+The main use for names on pointers is for "talloc reports". See
+talloc_report() and talloc_report_full() for details. Also see
+talloc_enable_leak_report() and talloc_enable_leak_report_full().
+
+The talloc_set_name() function allocates memory as a child of the
+pointer. It is logically equivalent to:
+  talloc_set_name_const(ptr, talloc_asprintf(ptr, fmt, ...));
+
+Note that multiple calls to talloc_set_name() will allocate more
+memory without releasing the name. All of the memory is released when
+the ptr is freed using talloc_free().
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_name_const(const void *ptr, const char *name);
+
+The function talloc_set_name_const() is just like talloc_set_name(),
+but it takes a string constant, and is much faster. It is extensively
+used by the "auto naming" macros, such as talloc_p().
+
+This function does not allocate any memory. It just copies the
+supplied pointer into the internal representation of the talloc
+ptr. This means you must not pass a name pointer to memory that will
+disappear before the ptr is freed with talloc_free().
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_named(const void *context, size_t size, const char *fmt, ...);
+
+The talloc_named() function creates a named talloc pointer. It is
+equivalent to:
+
+   ptr = talloc_size(context, size);
+   talloc_set_name(ptr, fmt, ....);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_named_const(const void *context, size_t size, const char *name);
+
+This is equivalent to:
+
+   ptr = talloc_size(context, size);
+   talloc_set_name_const(ptr, name);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+const char *talloc_get_name(const void *ptr);
+
+This returns the current name for the given talloc pointer. See
+talloc_set_name() for details.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_init(const char *fmt, ...);
+
+This function creates a zero length named talloc context as a top
+level context. It is equivalent to:
+
+  talloc_named(NULL, 0, fmt, ...);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_new(void *ctx);
+
+This is a utility macro that creates a new memory context hanging
+off an existing context, automatically naming it "talloc_new: __location__"
+where __location__ is the source line it is called from. It is
+particularly useful for creating a new temporary working context.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_realloc(const void *context, void *ptr, type, count);
+
+The talloc_realloc() macro changes the size of a talloc
+pointer. The "count" argument is the number of elements of type "type"
+that you want the resulting pointer to hold. 
+
+talloc_realloc() has the following equivalences:
+
+  talloc_realloc(context, NULL, type, 1) ==> talloc(context, type);
+  talloc_realloc(context, NULL, type, N) ==> talloc_array(context, type, N);
+  talloc_realloc(context, ptr, type, 0)  ==> talloc_free(ptr);
+
+The "context" argument is only used if "ptr" is NULL, otherwise it
+is ignored.
+
+talloc_realloc() returns the new pointer, or NULL on failure. The call
+will fail either due to a lack of memory, or because the pointer has
+more than one parent (see talloc_reference()).
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_realloc_size(const void *context, void *ptr, size_t size);
+
+The talloc_realloc_size() function is useful when the type is not
+known so the typesafe talloc_realloc() cannot be used.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_steal(const void *new_ctx, const void *ptr);
+
+The talloc_steal() function changes the parent context of a talloc
+pointer. It is typically used when the context that the pointer is
+currently a child of is going to be freed and you wish to keep the
+memory for a longer time. 
+
+The talloc_steal() function returns the pointer that you pass it. It
+does not have any failure modes.
+
+NOTE: It is possible to produce loops in the parent/child relationship
+if you are not careful with talloc_steal(). No guarantees are provided
+as to your sanity or the safety of your data if you do this.
+
+talloc_steal(new_ctx, NULL) will return NULL with no side effects.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+off_t talloc_total_size(const void *ptr);
+
+The talloc_total_size() function returns the total size in bytes used
+by this pointer and all child pointers. Mostly useful for debugging.
+
+Passing NULL is allowed, but it will only give a meaningful result if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+off_t talloc_total_blocks(const void *ptr);
+
+The talloc_total_blocks() function returns the total memory block
+count used by this pointer and all child pointers. Mostly useful for
+debugging.
+
+Passing NULL is allowed, but it will only give a meaningful result if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_report(const void *ptr, FILE *f);
+
+The talloc_report() function prints a summary report of all memory
+used by ptr. One line of report is printed for each immediate child of
+ptr, showing the total memory and number of blocks used by that child.
+
+You can pass NULL for the pointer, in which case a report is printed
+for the top level memory context, but only if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_report_full(const void *ptr, FILE *f);
+
+This provides a more detailed report than talloc_report(). It will
+recursively print the entire tree of memory referenced by the
+pointer. References in the tree are shown by giving the name of the
+pointer that is referenced.
+
+You can pass NULL for the pointer, in which case a report is printed
+for the top level memory context, but only if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_enable_leak_report(void);
+
+This enables calling of talloc_report(NULL, stderr) when the program
+exits. In Samba4 this is enabled by using the --leak-report command
+line option.
+
+For it to be useful, this function must be called before any other
+talloc function as it establishes a "null context" that acts as the
+top of the tree. If you don't call this function first then passing
+NULL to talloc_report() or talloc_report_full() won't give you the
+full tree printout.
+
+Here is a typical talloc report:
+
+talloc report on 'null_context' (total 267 bytes in 15 blocks)
+        libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+        libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+        iconv(UTF8,CP850)              contains     42 bytes in   2 blocks
+        libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+        iconv(CP850,UTF8)              contains     42 bytes in   2 blocks
+        iconv(UTF8,UTF-16LE)           contains     45 bytes in   2 blocks
+        iconv(UTF-16LE,UTF8)           contains     45 bytes in   2 blocks
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_enable_leak_report_full(void);
+
+This enables calling of talloc_report_full(NULL, stderr) when the
+program exits. In Samba4 this is enabled by using the
+--leak-report-full command line option.
+
+For it to be useful, this function must be called before any other
+talloc function as it establishes a "null context" that acts as the
+top of the tree. If you don't call this function first then passing
+NULL to talloc_report() or talloc_report_full() won't give you the
+full tree printout.
+
+Here is a typical full report:
+
+full talloc report on 'root' (total 18 bytes in 8 blocks)
+    p1                             contains     18 bytes in   7 blocks (ref 0)
+        r1                             contains     13 bytes in   2 blocks (ref 0)
+            reference to: p2
+        p2                             contains      1 bytes in   1 blocks (ref 1)
+        x3                             contains      1 bytes in   1 blocks (ref 0)
+        x2                             contains      1 bytes in   1 blocks (ref 0)
+        x1                             contains      1 bytes in   1 blocks (ref 0)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_enable_null_tracking(void);
+
+This enables tracking of the NULL memory context without enabling leak
+reporting on exit. Useful for when you want to do your own leak
+reporting call via talloc_report_null_full();
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_zero(const void *ctx, type);
+
+The talloc_zero() macro is equivalent to:
+
+  ptr = talloc(ctx, type);
+  if (ptr) memset(ptr, 0, sizeof(type));
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_zero_size(const void *ctx, size_t size)
+
+The talloc_zero_size() function is useful when you don't have a known type.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_memdup(const void *ctx, const void *p, size_t size);
+
+The talloc_memdup() function is equivalent to:
+
+  ptr = talloc_size(ctx, size);
+  if (ptr) memcpy(ptr, p, size);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_strdup(const void *ctx, const char *p);
+
+The talloc_strdup() function is equivalent to:
+
+  ptr = talloc_size(ctx, strlen(p)+1);
+  if (ptr) memcpy(ptr, p, strlen(p)+1);
+
+This function sets the name of the new pointer to the passed
+string. This is equivalent to:
+   talloc_set_name_const(ptr, ptr)
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_strndup(const void *t, const char *p, size_t n);
+
+The talloc_strndup() function is the talloc equivalent of the C
+library function strndup()
+
+This function sets the name of the new pointer to the passed
+string. This is equivalent to:
+   talloc_set_name_const(ptr, ptr)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_vasprintf(const void *t, const char *fmt, va_list ap);
+
+The talloc_vasprintf() function is the talloc equivalent of the C
+library function vasprintf()
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_asprintf(const void *t, const char *fmt, ...);
+
+The talloc_asprintf() function is the talloc equivalent of the C
+library function asprintf()
+
+This function sets the name of the new pointer to the passed
+string. This is equivalent to:
+   talloc_set_name_const(ptr, ptr)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_asprintf_append(char *s, const char *fmt, ...);
+
+The talloc_asprintf_append() function appends the given formatted 
+string to the given string. 
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_array(const void *ctx, type, uint_t count);
+
+The talloc_array() macro is equivalent to:
+
+  (type *)talloc_size(ctx, sizeof(type) * count);
+
+except that it provides integer overflow protection for the multiply,
+returning NULL if the multiply overflows.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_array_size(const void *ctx, size_t size, uint_t count);
+
+The talloc_array_size() function is useful when the type is not
+known. It operates in the same way as talloc_array(), but takes a size
+instead of a type.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_realloc_fn(const void *ctx, void *ptr, size_t size);
+
+This is a non-macro version of talloc_realloc(), which is useful 
+as libraries sometimes want a realloc function pointer. A realloc()
+implementation encapsulates the functionality of malloc(), free() and
+realloc() in one call, which is why it is useful to be able to pass
+around a single function pointer.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_autofree_context(void);
+
+This is a handy utility function that returns a talloc context
+which will be automatically freed on program exit. This can be used
+to reduce the noise in memory leak reports.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_check_name(const void *ptr, const char *name);
+
+This function checks if a pointer has the specified name. If it does
+then the pointer is returned. If it doesn't then NULL is returned.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_get_type(const void *ptr, type);
+
+This macro allows you to do type checking on talloc pointers. It is
+particularly useful for void* private pointers. It is equivalent to
+this:
+
+   (type *)talloc_check_name(ptr, #type)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+talloc_set_type(const void *ptr, type);
+
+This macro allows you to force the name of a pointer to be a
+particular type. This can be used in conjunction with
+talloc_get_type() to do type checking on void* pointers.
+
+It is equivalent to this:
+   talloc_set_name_const(ptr, #type)
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+talloc_get_size(const void *ctx);
+
+This function lets you know the amount of memory allocated so far by
+this context. It does NOT account for subcontext memory.
+This can be used to calculate the size of an array.
+
diff --git a/tools/xenstore/testsuite/01simple.sh b/tools/xenstore/testsuite/01simple.sh
new file mode 100644
index 0000000000..9b1eb8f5c3
--- /dev/null
+++ b/tools/xenstore/testsuite/01simple.sh
@@ -0,0 +1,4 @@
+#! /bin/sh
+
+# Create the entry /test, then read it back and check the contents match.
+[ "`echo -e 'write /test create contents\nread /test' | ./xs_test 2>&1`" = "contents" ]
diff --git a/tools/xenstore/testsuite/02directory.sh b/tools/xenstore/testsuite/02directory.sh
new file mode 100644
index 0000000000..f63ef1ff3d
--- /dev/null
+++ b/tools/xenstore/testsuite/02directory.sh
@@ -0,0 +1,31 @@
+#! /bin/sh
+
+# Root directory has nothing in it.
+[ "`echo -e 'dir /' | ./xs_test 2>&1`" = "" ]
+
+# Create a file.
+[ "`echo -e 'write /test create contents' | ./xs_test 2>&1`" = "" ]
+
+# Directory shows it.
+[ "`echo -e 'dir /' | ./xs_test 2>&1`" = "test" ]
+
+# Make a new directory.
+[ "`echo -e 'mkdir /dir' | ./xs_test 2>&1`" = "" ]
+
+# Check it's there.
+DIR="`echo -e 'dir /' | ./xs_test 2>&1`"
+[ "$DIR" = "test
+dir" ] || [ "$DIR" = "dir
+test" ]
+
+# Check it's empty.
+[ "`echo -e 'dir /dir' | ./xs_test 2>&1`" = "" ]
+
+# Create a file, check it exists.
+[ "`echo -e 'write /dir/test2 create contents2' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'dir /dir' | ./xs_test 2>&1`" = "test2" ]
+[ "`echo -e 'read /dir/test2' | ./xs_test 2>&1`" = "contents2" ]
+
+# Creating dir over the top should fail.
+[ "`echo -e 'mkdir /dir' | ./xs_test 2>&1`" = "FATAL: mkdir: File exists" ]
+[ "`echo -e 'mkdir /dir/test2' | ./xs_test 2>&1`" = "FATAL: mkdir: File exists" ]
diff --git a/tools/xenstore/testsuite/03write.sh b/tools/xenstore/testsuite/03write.sh
new file mode 100644
index 0000000000..cf5f897c54
--- /dev/null
+++ b/tools/xenstore/testsuite/03write.sh
@@ -0,0 +1,17 @@
+#! /bin/sh
+
+# Write without create fails.
+[ "`echo -e 'write /test none contents' | ./xs_test 2>&1`" = "FATAL: write: No such file or directory" ]
+
+# Exclusive write succeeds
+[ "`echo -e 'write /test excl contents' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'read /test' | ./xs_test 2>&1`" = "contents" ]
+
+# Exclusive write fails to overwrite.
+[ "`echo -e 'write /test excl contents' | ./xs_test 2>&1`" = "FATAL: write: File exists" ]
+
+# Non-exclusive overwrite succeeds.
+[ "`echo -e 'write /test none contents2' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'read /test' | ./xs_test 2>&1`" = "contents2" ]
+[ "`echo -e 'write /test create contents3' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'read /test' | ./xs_test 2>&1`" = "contents3" ]
diff --git a/tools/xenstore/testsuite/04rm.sh b/tools/xenstore/testsuite/04rm.sh
new file mode 100644
index 0000000000..abadd6110a
--- /dev/null
+++ b/tools/xenstore/testsuite/04rm.sh
@@ -0,0 +1,18 @@
+#! /bin/sh
+
+# Removing a non-existent entry fails.
+[ "`echo -e 'rm /test' | ./xs_test 2>&1`" = "FATAL: rm: No such file or directory" ]
+[ "`echo -e 'rm /dir/test' | ./xs_test 2>&1`" = "FATAL: rm: No such file or directory" ]
+
+# Create a file and remove it.
+[ "`echo -e 'write /test excl contents' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'rm /test' | ./xs_test 2>&1`" = "" ]
+
+# Create a directory and remove it.
+[ "`echo -e 'mkdir /dir' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'rm /dir' | ./xs_test 2>&1`" = "" ]
+
+# Create a directory containing a file, then remove the non-empty directory.
+[ "`echo -e 'mkdir /dir' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'write /dir/test excl contents' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'rm /dir' | ./xs_test 2>&1`" = "" ]
diff --git a/tools/xenstore/testsuite/05filepermissions.sh b/tools/xenstore/testsuite/05filepermissions.sh
new file mode 100644
index 0000000000..9d9043f191
--- /dev/null
+++ b/tools/xenstore/testsuite/05filepermissions.sh
@@ -0,0 +1,49 @@
+#! /bin/sh
+
+# Fail to get perms on non-existent file.
+[ "`echo -e 'getperm /test' | ./xs_test 2>&1`" = "FATAL: getperm: No such file or directory" ]
+[ "`echo -e 'getperm /dir/test' | ./xs_test 2>&1`" = "FATAL: getperm: No such file or directory" ]
+
+# Create file: we own it, noone has access.
+[ "`echo -e 'write /test excl contents' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'getperm /test' | ./xs_test 2>&1`" = "0 NONE" ]
+[ "`echo -e 'setid 1\ngetperm /test' | ./xs_test 2>&1`" = "FATAL: getperm: Permission denied" ]
+[ "`echo -e 'setid 1\nread /test' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 1\nwrite /test none contents2' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+
+# Grant everyone read access to file.
+[ "`echo -e 'setperm /test 0 READ' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /test' | ./xs_test 2>&1`" = "0 READ" ]
+[ "`echo -e 'setid 1\nread /test' | ./xs_test 2>&1`" = "contents" ]
+[ "`echo -e 'setid 1\nwrite /test none contents2' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+
+# Grant everyone write access to file.
+[ "`echo -e 'setperm /test 0 WRITE' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /test' | ./xs_test 2>&1`" = "FATAL: getperm: Permission denied" ]
+[ "`echo -e 'setid 1\nread /test' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 1\nwrite /test none contents2' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'read /test' | ./xs_test 2>&1`" = "contents2" ]
+
+# Grant everyone both read and write access.
+[ "`echo -e 'setperm /test 0 READ/WRITE' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /test' | ./xs_test 2>&1`" = "0 READ/WRITE" ]
+[ "`echo -e 'setid 1\nread /test' | ./xs_test 2>&1`" = "contents2" ]
+[ "`echo -e 'setid 1\nwrite /test none contents3' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\nread /test' | ./xs_test 2>&1`" = "contents3" ]
+
+# Change so that user 1 owns it, noone else can do anything.
+[ "`echo -e 'setperm /test 1 NONE' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /test' | ./xs_test 2>&1`" = "1 NONE" ]
+[ "`echo -e 'setid 1\nread /test' | ./xs_test 2>&1`" = "contents3" ]
+[ "`echo -e 'setid 1\nwrite /test none contents4' | ./xs_test 2>&1`" = "" ]
+
+# User 2 can do nothing.
+[ "`echo -e 'setid 2\nsetperm /test 2 NONE' | ./xs_test 2>&1`" = "FATAL: setperm: Permission denied" ]
+[ "`echo -e 'setid 2\ngetperm /test' | ./xs_test 2>&1`" = "FATAL: getperm: Permission denied" ]
+[ "`echo -e 'setid 2\nread /test' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /test none contents4' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+
+# Tools can always access things.
+[ "`echo -e 'getperm /test' | ./xs_test 2>&1`" = "1 NONE" ]
+[ "`echo -e 'read /test' | ./xs_test 2>&1`" = "contents4" ]
+[ "`echo -e 'write /test none contents5' | ./xs_test 2>&1`" = "" ]
diff --git a/tools/xenstore/testsuite/06dirpermissions.sh b/tools/xenstore/testsuite/06dirpermissions.sh
new file mode 100644
index 0000000000..922a794f04
--- /dev/null
+++ b/tools/xenstore/testsuite/06dirpermissions.sh
@@ -0,0 +1,61 @@
+#! /bin/sh
+
+# Root directory: owned by tool, everyone has read access.
+[ "`echo -e 'getperm /' | ./xs_test 2>&1`" = "0 READ" ]
+
+# Create directory: we own it, no one else has access.
+[ "`echo -e 'mkdir /dir' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'getperm /dir' | ./xs_test 2>&1`" = "0 NONE" ]
+[ "`echo -e 'setid 1\ndir /dir' | ./xs_test 2>&1`" = "FATAL: dir: Permission denied" ]
+[ "`echo -e 'setid 1\nread /dir/test create contents2' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 1\nwrite /dir/test create contents2' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+
+# Grant everyone read access to directory (creating entries is still denied).
+[ "`echo -e 'setperm /dir 0 READ' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /dir' | ./xs_test 2>&1`" = "0 READ" ]
+[ "`echo -e 'setid 1\ndir /dir' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\nwrite /dir/test create contents2' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+
+# Grant everyone write-only access to directory (getperm and dir are then denied).
+[ "`echo -e 'setperm /dir 0 WRITE' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /dir' | ./xs_test 2>&1`" = "FATAL: getperm: Permission denied" ]
+[ "`echo -e 'setid 1\ndir /dir' | ./xs_test 2>&1`" = "FATAL: dir: Permission denied" ]
+[ "`echo -e 'setid 1\nwrite /dir/test create contents' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'read /dir/test' | ./xs_test 2>&1`" = "contents" ]
+
+# Grant everyone both read and write access.
+[ "`echo -e 'setperm /dir 0 READ/WRITE' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /dir' | ./xs_test 2>&1`" = "0 READ/WRITE" ]
+[ "`echo -e 'setid 1\ndir /dir' | ./xs_test 2>&1`" = "test" ]
+[ "`echo -e 'setid 1\nwrite /dir/test2 create contents' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\nread /dir/test2' | ./xs_test 2>&1`" = "contents" ]
+
+# Change so that user 1 owns it, no one else can do anything.
+[ "`echo -e 'setperm /dir 1 NONE' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e 'setid 1\ngetperm /dir' | ./xs_test 2>&1`" = "1 NONE" ]
+[ "`echo -e 'setid 1\ndir /dir' | ./xs_test 2>&1 | sort`" = "test
+test2" ]
+[ "`echo -e 'setid 1\nwrite /dir/test3 create contents' | ./xs_test 2>&1`" = "" ]
+
+# User 2 can do nothing.  Can't even tell if a file exists.
+[ "`echo -e 'setid 2\nsetperm /dir 2 NONE' | ./xs_test 2>&1`" = "FATAL: setperm: Permission denied" ]
+[ "`echo -e 'setid 2\ngetperm /dir' | ./xs_test 2>&1`" = "FATAL: getperm: Permission denied" ]
+[ "`echo -e 'setid 2\ndir /dir' | ./xs_test 2>&1`" = "FATAL: dir: Permission denied" ]
+[ "`echo -e 'setid 2\nread /dir/test' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 2\nread /dir/test2' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 2\nread /dir/test3' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 2\nread /dir/test4' | ./xs_test 2>&1`" = "FATAL: read: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /dir/test none contents' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /dir/test create contents' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /dir/test excl contents' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /dir/test4 none contents' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /dir/test4 create contents' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+[ "`echo -e 'setid 2\nwrite /dir/test4 excl contents' | ./xs_test 2>&1`" = "FATAL: write: Permission denied" ]
+
+# Tools (commands issued without setid) can always access things.
+[ "`echo -e 'getperm /dir' | ./xs_test 2>&1`" = "1 NONE" ]
+[ "`echo -e 'dir /dir' | ./xs_test 2>&1 | sort`" = "test
+test2
+test3" ]
+[ "`echo -e 'write /dir/test4 create contents' | ./xs_test 2>&1`" = "" ]
+
diff --git a/tools/xenstore/testsuite/07watch.sh b/tools/xenstore/testsuite/07watch.sh
new file mode 100644
index 0000000000..bedce6ad5b
--- /dev/null
+++ b/tools/xenstore/testsuite/07watch.sh
@@ -0,0 +1,32 @@
+#! /bin/sh
+
+# Watch something, write to it, check watch has fired.
+[ "`echo -e 'write /test create contents' | ./xs_test 2>&1`" = "" ]
+
+[ "`echo -e '1 watch /test 100\n2 write /test create contents2\n1 waitwatch\n1 ackwatch' | ./xs_test 2>&1`" = "1:/test" ]
+
+# Check that reads don't set it off.
+[ "`echo -e '1 watch /test 100\n2 read /test\n1 waitwatch' | ./xs_test 2>&1`" = "2:contents2
+1:waitwatch timeout" ]
+
+# mkdir, setperm and rm should fire watches too (this also tests watching dirs).
+[ "`echo -e 'mkdir /dir' | ./xs_test 2>&1`" = "" ]
+[ "`echo -e '1 watch /dir 100\n2 mkdir /dir/newdir\n1 waitwatch\n1 ackwatch\n2 setperm /dir/newdir 0 READ\n1 waitwatch\n1 ackwatch\n2 rm /dir/newdir\n1 waitwatch\n1 ackwatch' | ./xs_test 2>&1`" = "1:/dir/newdir
+1:/dir/newdir
+1:/dir/newdir" ]
+
+# ignore watches while doing commands, should work.
+[ "`echo -e 'watch /dir 100\nwrite /dir/test create contents\nread /dir/test\nwaitwatch\nackwatch' | ./xs_test 2>&1`" = "contents
+/dir/test" ]
+
+# Watch priority test: the highest-priority watcher is notified first.
+[ "`echo -e '1 watch /dir 1\n3 watch /dir 3\n2 watch /dir 2\nwrite /dir/test create contents\n3 waitwatch\n3 ackwatch\n2 waitwatch\n2 ackwatch\n1 waitwatch\n1 ackwatch' | ./xs_test 2>&1`" = "3:/dir/test
+2:/dir/test
+1:/dir/test" ]
+
+# If one dies (without acking), the other should still get ack.
+[ "`echo -e '1 watch /dir 0\n2 watch /dir 1\nwrite /dir/test create contents\n2 waitwatch\n2 close\n1 waitwatch\n1 ackwatch' | ./xs_test 2>&1`" = "2:/dir/test
+1:/dir/test" ]
+
+# If one dies (without reading at all), the other should still get ack.
+[ "`echo -e '1 watch /dir 0\n2 watch /dir 1\nwrite /dir/test create contents\n2 close\n1 waitwatch\n1 ackwatch' | ./xs_test 2>&1`" = "1:/dir/test" ]
diff --git a/tools/xenstore/testsuite/08transaction.sh b/tools/xenstore/testsuite/08transaction.sh
new file mode 100644
index 0000000000..2c23ed2496
--- /dev/null
+++ b/tools/xenstore/testsuite/08transaction.sh
@@ -0,0 +1,54 @@
+#! /bin/sh
+# Test transactions.
+
+# Simple transaction: create a file inside transaction.
+[ "`echo -e '1 start /
+1 write /entry1 create contents
+2 dir /
+1 dir /
+1 commit
+2 read /entry1' | ./xs_test`" = "1:entry1
+2:contents" ]
+echo rm /entry1 | ./xs_test
+
+# Create a file and abort transaction.
+[ "`echo -e '1 start /
+1 write /entry1 create contents
+2 dir /
+1 dir /
+1 abort
+2 dir /' | ./xs_test`" = "1:entry1" ]
+
+echo write /entry1 create contents | ./xs_test
+# Delete in transaction, commit
+[ "`echo -e '1 start /
+1 rm /entry1
+2 dir /
+1 dir /
+1 commit
+2 dir /' | ./xs_test`" = "2:entry1" ]
+
+# Delete in transaction, abort.
+echo write /entry1 create contents | ./xs_test
+[ "`echo -e '1 start /
+1 rm /entry1
+2 dir /
+1 dir /
+1 abort
+2 dir /' | ./xs_test`" = "2:entry1
+2:entry1" ]
+
+# Transactions can take as long as they want...
+[ "`echo -e 'start /
+sleep 1
+rm /entry1
+commit
+dir /' | ./xs_test`" = "" ]
+
+# ... as long as no one is waiting.
+[ "`echo -e '1 start /
+2 mkdir /dir
+1 mkdir /dir
+1 dir /
+1 commit' | ./xs_test 2>&1`" = "1:dir
+FATAL: 1: commit: Connection timed out" ]
diff --git a/tools/xenstore/testsuite/09domain.sh b/tools/xenstore/testsuite/09domain.sh
new file mode 100644
index 0000000000..9208dda0ec
--- /dev/null
+++ b/tools/xenstore/testsuite/09domain.sh
@@ -0,0 +1,15 @@
+#! /bin/sh
+# Test domain communication.
+
+# Create a domain, write an entry.
+[ "`echo -e 'introduce 1 100 7 /my/home
+1 write /entry1 create contents
+dir /' | ./xs_test 2>&1`" = "handle is 1
+entry1" ]
+
+# Release that domain.
+[ "`echo -e 'release 1' | ./xs_test`" = "" ]
+
+# Introduce and release by same connection.
+[ "`echo -e 'introduce 1 100 7 /my/home
+release 1' | ./xs_test 2>&1`" = "handle is 1" ]
diff --git a/tools/xenstore/testsuite/test.sh b/tools/xenstore/testsuite/test.sh
new file mode 100755
index 0000000000..5718e84a15
--- /dev/null
+++ b/tools/xenstore/testsuite/test.sh
@@ -0,0 +1,44 @@
+#! /bin/sh
+
+set -e
+set -m
+
+# Run one test script against a freshly started xenstored_test daemon.
+#   $1 - path of the test script to run
+#   $2 - optional extra flag handed to sh (e.g. -x for a verbose rerun)
+# Returns 0 when the script passed and valgrind (if used) logged nothing,
+# 1 otherwise.
+run_test()
+{
+    # Abort loudly if the store root is unset or empty instead of running
+    # rm/mkdir with no operand; quote it so odd paths stay one word.
+    rm -rf "${XENSTORED_ROOTDIR:?XENSTORED_ROOTDIR must be set}"
+    mkdir "$XENSTORED_ROOTDIR"
+# Weird failures with this.
+    if type valgrind >/dev/null 2>&1; then
+	valgrind -q --logfile-fd=3 ./xenstored_test --output-pid --no-fork 3>testsuite/tmp/vgout > /tmp/pid &
+	# Wait until the daemon has written its pid.
+	while [ ! -s /tmp/pid ]; do sleep 0; done
+	PID=`cat /tmp/pid`
+	rm /tmp/pid
+    else
+	PID=`./xenstored_test --output-pid`
+    fi
+    # $2 is deliberately unquoted: when empty it must vanish, not become "".
+    if sh -e $2 "$1"; then
+	if [ -s testsuite/tmp/vgout ]; then
+	    kill $PID
+	    echo VALGRIND errors:
+	    cat testsuite/tmp/vgout
+	    return 1
+	fi
+	echo shutdown | ./xs_test
+	return 0
+    else
+	# In case daemon is wedged.
+	kill $PID
+	sleep 1
+	return 1
+    fi
+}
+
+# Run every numbered test script in order; on the first failure rerun it
+# with shell tracing (-x) and stop.
+for f in testsuite/[0-9]*.sh; do
+    if ! run_test $f; then
+	echo Test $f failed, running verbosely...
+	run_test $f -x
+	exit 1
+    fi
+    echo Test $f passed...
+done
diff --git a/tools/xenstore/utils.c b/tools/xenstore/utils.c
new file mode 100644
index 0000000000..2345021f70
--- /dev/null
+++ b/tools/xenstore/utils.c
@@ -0,0 +1,143 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <signal.h>
+
+#include "utils.h"
+
+/* printf-style output to /dev/console, falling back to stderr when the
+ * console cannot be opened.  The stream is picked on first use and kept. */
+void xprintf(const char *fmt, ...)
+{
+	static FILE *out = NULL;
+	va_list ap;
+
+	if (out == NULL) {
+		out = fopen("/dev/console", "w");
+		if (out == NULL)
+			out = stderr;
+	}
+
+	va_start(ap, fmt);
+	vfprintf(out, fmt, ap);
+	va_end(ap);
+	fflush(out);
+}
+
+/* Report a fatal error as "FATAL: <message>" through xprintf() and exit
+ * with status 1.  Takes printf-style arguments; never returns. */
+void barf(const char *fmt, ...)
+{
+	char *str;
+	va_list arglist;
+	int len;
+
+	xprintf("FATAL: ");
+
+	va_start(arglist, fmt);
+	len = vasprintf(&str, fmt, arglist);
+	va_end(arglist);
+
+	/* On failure vasprintf() leaves str undefined: print the raw format
+	 * rather than reading and freeing a garbage pointer. */
+	if (len < 0) {
+		xprintf("%s\n", fmt);
+		exit(1);
+	}
+
+	xprintf("%s\n", str);
+	free(str);
+	exit(1);
+}
+
+/* Like barf(), but appends ": <strerror(errno)>" using the errno value
+ * seen on entry.  Exits with status 1; never returns. */
+void barf_perror(const char *fmt, ...)
+{
+	char *str;
+	int err = errno;	/* saved before xprintf() can clobber it */
+	va_list arglist;
+	int len;
+
+	xprintf("FATAL: ");
+
+	va_start(arglist, fmt);
+	len = vasprintf(&str, fmt, arglist);
+	va_end(arglist);
+
+	/* On failure vasprintf() leaves str undefined: print the raw format
+	 * rather than reading and freeing a garbage pointer. */
+	if (len < 0) {
+		xprintf("%s: %s\n", fmt, strerror(err));
+		exit(1);
+	}
+
+	xprintf("%s: %s\n", str, strerror(err));
+	free(str);
+	exit(1);
+}
+
+/* Resize ptr to hold num elements of size bytes each.
+ * Returns NULL if size * num could overflow; otherwise the result of
+ * realloc_nofail(). */
+void *_realloc_array(void *ptr, size_t size, size_t num)
+{
+	/* A zero element size cannot overflow, and must not reach the
+	 * division below. */
+	if (size != 0 && num >= SIZE_MAX/size)
+		return NULL;
+	return realloc_nofail(ptr, size * num);
+}
+
+/* realloc() that calls barf() (and so exits) instead of returning NULL. */
+void *realloc_nofail(void *ptr, size_t size)
+{
+	void *grown = realloc(ptr, size);
+
+	if (!grown)
+		barf("realloc of %zu failed", size);
+	return grown;
+}
+
+/* malloc() that calls barf() (and so exits) instead of returning NULL. */
+void *malloc_nofail(size_t size)
+{
+	void *fresh = malloc(size);
+
+	if (!fresh)
+		barf("malloc of %zu failed", size);
+	return fresh;
+}
+
+/* Detach from the invoking process, after Stevens: fork (parent exits),
+ * close the standard descriptors, start a new session, chdir to / and
+ * clear the umask. */
+void daemonize(void)
+{
+	pid_t child = fork();
+
+	/* Fork so the parent can exit and init inherits us. */
+	if (child < 0)
+		barf_perror("Failed to fork daemon");
+	if (child > 0)
+		exit(0);
+
+	close(STDIN_FILENO);
+	close(STDOUT_FILENO);
+	close(STDERR_FILENO);
+
+	/* Become session leader so ^C doesn't reach us. */
+	setsid();
+
+	/* Don't pin whatever mount point we were started from. */
+	chdir("/");
+
+	/* Start with a clean file-creation mask. */
+	umask(0);
+}
+
+
+/* Read a whole file into a malloc()ed buffer.
+ * This version adds one byte (for nul term).
+ *
+ * filename: path to read, or "-" to read from a dup of stdin.
+ * size:     receives the number of bytes read (excluding the nul).
+ * Returns the buffer (release with release_file()), or NULL if the file
+ * cannot be opened, a read fails, or memory runs out. */
+void *grab_file(const char *filename, unsigned long *size)
+{
+	unsigned int max = 16384;
+	int ret, fd;
+	char *buffer;
+
+	if (streq(filename, "-"))
+		fd = dup(STDIN_FILENO);
+	else
+		fd = open(filename, O_RDONLY, 0);
+
+	if (fd < 0)
+		return NULL;
+
+	*size = 0;
+	buffer = malloc(max + 1);
+	if (!buffer) {
+		close(fd);
+		return NULL;
+	}
+
+	while ((ret = read(fd, buffer + *size, max - *size)) > 0) {
+		*size += ret;
+		if (*size == max) {
+			char *bigger;
+
+			/* Double the capacity, keeping one spare byte for the
+			 * nul.  (The old "max *= 2 + 1" tripled max instead.) */
+			max *= 2;
+			bigger = realloc(buffer, max + 1);
+			if (!bigger) {
+				/* Old buffer is still ours: fail through the
+				 * common error path so it gets freed. */
+				ret = -1;
+				break;
+			}
+			buffer = bigger;
+		}
+	}
+	if (ret < 0) {
+		free(buffer);
+		buffer = NULL;
+	} else
+		buffer[*size] = '\0';
+	close(fd);
+	return buffer;
+}
+
+/* Free a buffer with free(); the size argument is ignored.  Declared in
+ * utils.h as the counterpart of grab_file(). */
+void release_file(void *data, unsigned long size __attribute__((unused)))
+{
+	free(data);
+}
diff --git a/tools/xenstore/utils.h b/tools/xenstore/utils.h
new file mode 100644
index 0000000000..a84f19a22a
--- /dev/null
+++ b/tools/xenstore/utils.h
@@ -0,0 +1,61 @@
+#ifndef _UTILS_H
+#define _UTILS_H
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+
+/* Is A == B ? */
+#define streq(a,b) (strcmp((a),(b)) == 0)
+
+/* Does A start with B ? */
+#define strstarts(a,b) (strncmp((a),(b),strlen(b)) == 0)
+
+/* True when string a ends with string b (an empty b always matches). */
+static inline bool strends(const char *a, const char *b)
+{
+	size_t alen = strlen(a);
+	size_t blen = strlen(b);
+
+	if (alen < blen)
+		return false;
+
+	return strcmp(a + alen - blen, b) == 0;
+}
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#define ___stringify(x)	#x
+#define __stringify(x)		___stringify(x)
+
+/* Convenient wrappers for malloc and realloc.  Use them. */
+#define new(type) ((type *)malloc_nofail(sizeof(type)))
+#define new_array(type, num) realloc_array((type *)0, (num))
+#define realloc_array(ptr, num) ((__typeof__(ptr))_realloc_array((ptr), sizeof((*ptr)), (num)))
+
+void *malloc_nofail(size_t size);
+void *realloc_nofail(void *ptr, size_t size);
+void *_realloc_array(void *ptr, size_t size, size_t num);
+
+void barf(const char *fmt, ...) __attribute__((noreturn));
+void barf_perror(const char *fmt, ...) __attribute__((noreturn));
+
+/* This version adds one byte (for nul term) */
+void *grab_file(const char *filename, unsigned long *size);
+void release_file(void *data, unsigned long size);
+
+/* For writing daemons, based on Stevens. */
+void daemonize(void);
+
+/* Signal handling: returns fd to listen on. */
+int signal_to_fd(int signal);
+void close_signal(int fd);
+
+void xprintf(const char *fmt, ...);
+
+/* Tagged logging through xprintf().  The calling function's name is
+ * printed immediately before the expanded format with no separator, so
+ * callers supply any ": " or newline themselves. */
+#define eprintf(_fmt, _args...) xprintf("[ERR] %s" _fmt, __FUNCTION__, ##_args)
+#define iprintf(_fmt, _args...) xprintf("[INF] %s" _fmt, __FUNCTION__, ##_args)
+
+/* Debug output: compiled out entirely unless DEBUG is defined. */
+#ifdef DEBUG
+#define dprintf(_fmt, _args...) xprintf("[DBG] %s" _fmt, __FUNCTION__, ##_args)
+#else
+#define dprintf(_fmt, _args...) ((void)0)
+#endif
+
+#endif /* _UTILS_H */
diff --git a/tools/xenstore/xenstored.h b/tools/xenstore/xenstored.h
new file mode 100644
index 0000000000..784ec987a8
--- /dev/null
+++ b/tools/xenstore/xenstored.h
@@ -0,0 +1,81 @@
+/* 
+    Simple prototype Xen Store Daemon providing simple tree-like database.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#ifndef _XENSTORED_H
+#define _XENSTORED_H
+
+/* Message types carried in xsd_sockmsg.type.  Values are assigned in
+ * declaration order starting from 0. */
+enum xsd_sockmsg_type
+{
+	XS_DEBUG,
+	XS_SHUTDOWN,
+	XS_DIRECTORY,
+	XS_READ,
+	XS_GET_PERMS,
+	XS_WATCH,
+	XS_WATCH_ACK,
+	XS_UNWATCH,
+	XS_TRANSACTION_START,
+	XS_TRANSACTION_END,
+	/* Alias for XS_TRANSACTION_END.  NOTE(review): presumably marks the
+	 * last operation allowed on a read-only connection - confirm. */
+	XS_OP_READ_ONLY = XS_TRANSACTION_END,
+	XS_INTRODUCE,
+	XS_RELEASE,
+	XS_GETDOMAINPATH,
+	XS_WRITE,
+	XS_MKDIR,
+	XS_RM,
+	XS_SET_PERMS,
+	XS_WATCH_EVENT,
+	XS_ERROR,
+};
+
+#define XS_WRITE_NONE "NONE"
+#define XS_WRITE_CREATE "CREATE"
+#define XS_WRITE_CREATE_EXCL "CREATE|EXCL"
+
+/* We hand errors as strings, for portability. */
+struct xsd_errors
+{
+	int errnum;		/* errno value */
+	const char *errstring;	/* its symbolic name, e.g. "EINVAL" */
+};
+/* Builds one table entry: the errno constant paired with its own name. */
+#define XSD_ERROR(x) { x, #x }
+/* Table of the errno values that have a string form here.  Marked unused
+ * because it is a static definition in a header. */
+static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
+	XSD_ERROR(EINVAL),
+	XSD_ERROR(EACCES),
+	XSD_ERROR(EEXIST),
+	XSD_ERROR(EISDIR),
+	XSD_ERROR(ENOENT),
+	XSD_ERROR(ENOMEM),
+	XSD_ERROR(ENOSPC),
+	XSD_ERROR(EIO),
+	XSD_ERROR(ENOTEMPTY),
+	XSD_ERROR(ENOSYS),
+	XSD_ERROR(EROFS),
+	XSD_ERROR(EBUSY),
+	XSD_ERROR(ETIMEDOUT),
+	XSD_ERROR(EISCONN),
+};
+/* Fixed-size header that precedes every message's payload. */
+struct xsd_sockmsg
+{
+	u32 type;		/* NOTE(review): presumably an enum xsd_sockmsg_type value - confirm */
+	u32 len; 		/* Length of data following this. */
+
+	/* Generally followed by nul-terminated string(s). */
+};
+
+#endif /* _XENSTORED_H */
diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
new file mode 100644
index 0000000000..9d15848463
--- /dev/null
+++ b/tools/xenstore/xenstored_core.c
@@ -0,0 +1,1354 @@
+/* 
+    Simple prototype Xen Store Daemon providing simple tree-like database.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/un.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <syslog.h>
+#include <string.h>
+#include <errno.h>
+#include <dirent.h>
+#include <getopt.h>
+#include <signal.h>
+#include <assert.h>
+#include <setjmp.h>
+
+//#define DEBUG
+#include "utils.h"
+#include "list.h"
+#include "talloc.h"
+#include "xs_lib.h"
+#include "xenstored.h"
+#include "xenstored_core.h"
+#include "xenstored_watch.h"
+#include "xenstored_transaction.h"
+#include "xenstored_domain.h"
+
+static bool verbose;
+static LIST_HEAD(connections);
+
+#ifdef TESTING
+static bool failtest = false;
+
+/* We override talloc's malloc. */
+void *test_malloc(size_t size)
+{
+	/* Fail 1 allocation in 32 on average while failtest is set (the
+	 * original note said "1 in 20" and that only about 50% of
+	 * connections then establish). */
+	if (failtest && (random() % 32) == 0)
+		return NULL;
+	return malloc(size);
+}
+
+/* Signal-handler-shaped function that switches failure injection off;
+ * the signal number is ignored. */
+static void stop_failtest(int signum __attribute__((unused)))
+{
+	failtest = false;
+}
+
+/* Need these before we #define away write_all/mkdir in testing.h */
+bool test_write_all(int fd, void *contents, unsigned int len);
+/* write_all() wrapper that, while failtest is set, fails 1 call in 8 after
+ * writing a random-length prefix and reports ENOSPC. */
+bool test_write_all(int fd, void *contents, unsigned int len)
+{
+	if (!failtest || (random() % 8) != 0)
+		return write_all(fd, contents, len);
+
+	/* Injected failure: emit a truncated write first. */
+	if (len != 0)
+		len = random() % len;
+	write(fd, contents, len);
+	errno = ENOSPC;
+	return false;
+}
+
+int test_mkdir(const char *dir, int perms);
+/* mkdir() wrapper that, while failtest is set, fails 1 call in 8 with
+ * ENOSPC without touching the filesystem. */
+int test_mkdir(const char *dir, int perms)
+{
+	if (!failtest || (random() % 8) != 0)
+		return mkdir(dir, perms);
+
+	errno = ENOSPC;
+	return -1;
+}
+#endif /* TESTING */
+
+#include "xenstored_test.h"
+
+/* FIXME: Ideally, this should never be called.  Some can be eliminated. */
+/* Something is horribly wrong: shutdown immediately.
+ *
+ * Formats the printf-style message and reports it, together with the
+ * connection id (-1 when conn is NULL) and the errno seen on entry, via
+ * eprintf() and syslog(), then ends the process with _exit(2). */
+void __attribute__((noreturn)) corrupt(struct connection *conn,
+				       const char *fmt, ...)
+{
+	va_list arglist;
+	char *str;
+	const char *msg;
+	int saved_errno = errno;
+
+	va_start(arglist, fmt);
+	str = talloc_vasprintf(NULL, fmt, arglist);
+	va_end(arglist);
+
+	/* Never hand NULL to %s if formatting failed: fall back to the raw
+	 * format string. */
+	msg = str ? str : fmt;
+
+	eprintf("xenstored corruption: connection id %i: err %s: %s",
+		conn ? (int)conn->id : -1, strerror(saved_errno), msg);
+#ifdef TESTING
+	/* Allow them to attach debugger. */
+	sleep(30);
+#endif
+	/* syslog() wants facility|level; a bare facility implies level 0,
+	 * i.e. LOG_EMERG.  State the level explicitly. */
+	syslog(LOG_DAEMON | LOG_CRIT,
+	       "xenstored corruption: connection id %i: err %s: %s",
+	       conn ? (int)conn->id : -1, strerror(saved_errno), msg);
+	_exit(2);
+}
+
+/* Make progress on sending conn->out using the connection's write hook.
+ * The header goes out first, then the payload; out->used counts the bytes
+ * already sent of whichever part is in progress.
+ * Returns false if the write hook reported an error, true otherwise (also
+ * when the message is only partly sent and must be resumed later). */
+static bool write_message(struct connection *conn)
+{
+	int ret;
+	struct buffered_data *out = conn->out;
+
+	if (out->inhdr) {
+		if (verbose)
+			xprintf("Writing msg %i out to %p\n",
+				out->hdr.msg.type, conn);
+		ret = conn->write(conn, out->hdr.raw + out->used,
+				  sizeof(out->hdr) - out->used);
+		if (ret < 0)
+			return false;
+
+		out->used += ret;
+		/* Header still incomplete: try again on the next call. */
+		if (out->used < sizeof(out->hdr))
+			return true;
+
+		/* Header done; restart the counter for the payload. */
+		out->inhdr = false;
+		out->used = 0;
+
+		/* Second write might block if non-zero. */
+		if (out->hdr.msg.len)
+			return true;
+	}
+
+	if (verbose)
+		xprintf("Writing data len %i out to %p\n",
+			out->hdr.msg.len, conn);
+	ret = conn->write(conn, out->buffer + out->used,
+			  out->hdr.msg.len - out->used);
+
+	if (ret < 0)
+		return false;
+
+	out->used += ret;
+	/* Payload still incomplete: try again on the next call. */
+	if (out->used != out->hdr.msg.len)
+		return true;
+
+	/* Whole message sent: detach it from the connection. */
+	conn->out = NULL;
+
+	/* If this was an event, we wait for ack, otherwise we're done. */
+	if (!is_watch_event(conn, out))
+		talloc_free(out);
+
+	queue_next_event(conn);
+	return true;
+}
+
+/* Destructor-shaped cleanup for a struct connection (takes void *, returns
+ * int).  For connections without a domain it flushes pending output while
+ * the fd is writable and then closes the fd; in all cases it unlinks the
+ * connection from its list.  Always returns 0. */
+static int destroy_conn(void *_conn)
+{
+	struct connection *conn = _conn;
+
+	/* Flush outgoing if possible, but don't block. */
+	if (!conn->domain) {
+		fd_set set;
+		struct timeval none;
+
+		FD_ZERO(&set);
+		FD_SET(conn->fd, &set);
+		/* Zero timeout: select() only polls. */
+		none.tv_sec = none.tv_usec = 0;
+
+		/* Keep writing while output remains and the fd is writable
+		 * right now; stop on the first write error. */
+		while (conn->out
+		       && select(conn->fd+1, NULL, &set, NULL, &none) == 1)
+			if (!write_message(conn))
+				break;
+		close(conn->fd);
+	}
+	list_del(&conn->list);
+	return 0;
+}
+
+/* Fill the select() sets for one pass: the three fixed descriptors are
+ * always watched for input; each non-domain connection is watched for input
+ * unless blocked, and for output when it has data queued.
+ * Returns the highest descriptor seen. */
+static int initialize_set(fd_set *inset, fd_set *outset, int sock, int ro_sock,
+			  int event_fd)
+{
+	struct connection *conn;
+	int highest = sock;
+
+	FD_ZERO(inset);
+	FD_ZERO(outset);
+
+	FD_SET(sock, inset);
+	FD_SET(ro_sock, inset);
+	FD_SET(event_fd, inset);
+
+	if (ro_sock > highest)
+		highest = ro_sock;
+	if (event_fd > highest)
+		highest = event_fd;
+
+	list_for_each_entry(conn, &connections, list) {
+		/* Connections with a domain are not handled through these sets. */
+		if (!conn->domain) {
+			if (!conn->blocked)
+				FD_SET(conn->fd, inset);
+			if (conn->out)
+				FD_SET(conn->fd, outset);
+			if (conn->fd > highest)
+				highest = conn->fd;
+		}
+	}
+	return highest;
+}
+
+/* Read everything from a talloc_open'ed fd.
+ *
+ * The buffer is allocated as a talloc child of fd, so it is released along
+ * with it.  *size receives the number of bytes read.
+ * Returns the buffer, or NULL with errno set (ENOMEM if allocation failed,
+ * otherwise whatever read() reported). */
+static void *read_all(int *fd, unsigned int *size)
+{
+	unsigned int max = 4;
+	int ret;
+	char *buffer = talloc_size(fd, max);
+
+	*size = 0;
+	/* Report allocation failure as ENOMEM rather than letting read()
+	 * fail with EFAULT on a NULL buffer. */
+	if (!buffer) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	while ((ret = read(*fd, buffer + *size, max - *size)) > 0) {
+		*size += ret;
+		if (*size == max) {
+			/* Full: double the capacity.  On failure the old
+			 * buffer stays owned by fd, so nothing leaks. */
+			max *= 2;
+			buffer = talloc_realloc_size(fd, buffer, max);
+			if (!buffer) {
+				errno = ENOMEM;
+				return NULL;
+			}
+		}
+	}
+	if (ret < 0)
+		return NULL;
+	return buffer;
+}
+
+/* talloc destructor for an int holding a file descriptor: closes it.
+ * Always returns 0. */
+static int destroy_fd(void *_fd)
+{
+	close(*(int *)_fd);
+	return 0;
+}
+
+/* Return a pointer to an fd, self-closing and attached to this pathname.
+ *
+ * The int is a talloc child of pathname with destroy_fd() as destructor, so
+ * the descriptor is closed when it is freed.
+ * Returns NULL with errno set if allocation (ENOMEM) or open() fails. */
+static int *talloc_open(const char *pathname, int flags, int mode)
+{
+	int *fd;
+
+	fd = talloc(pathname, int);
+	if (!fd) {
+		/* Don't dereference a failed allocation. */
+		errno = ENOMEM;
+		return NULL;
+	}
+	*fd = open(pathname, flags, mode);
+	if (*fd < 0) {
+		/* talloc_free() may clobber errno: preserve open()'s. */
+		int saved_errno = errno;
+		talloc_free(fd);
+		errno = saved_errno;
+		return NULL;
+	}
+	talloc_set_destructor(fd, destroy_fd);
+	return fd;
+}
+
+/* True when child names parent itself or something beneath it.  Matching
+ * is by path component: "/ab" is not a child of "/a". */
+bool is_child(const char *child, const char *parent)
+{
+	unsigned int plen;
+
+	/* The root would have to be "" for the prefix test below to cover
+	 * it, but that's a usability nightmare; special-case it instead. */
+	if (strcmp(parent, "/") == 0)
+		return true;
+
+	plen = strlen(parent);
+	if (strncmp(child, parent, plen) != 0)
+		return false;
+
+	/* The prefix must end exactly on a component boundary. */
+	switch (child[plen]) {
+	case '/':
+	case '\0':
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Map a node name to its directory under xs_daemon_store(); the root node
+ * maps to the store directory itself.  Answer never ends in /.
+ * The result is a talloc child of node. */
+char *node_dir_outside_transaction(const char *node)
+{
+	const char *store = xs_daemon_store();
+
+	if (streq(node, "/"))
+		return talloc_strdup(node, store);
+	return talloc_asprintf(node, "%s%s", store, node);
+}
+
+/* Directory backing node: the transaction's copy when a transaction is
+ * given and covers the node, the main store's otherwise. */
+static char *node_dir(struct transaction *trans, const char *node)
+{
+	if (trans && within_transaction(trans, node))
+		return node_dir_inside_transaction(trans, node);
+	return node_dir_outside_transaction(node);
+}
+
+/* Path of the ".data" file inside node's directory (talloc child of node). */
+static char *node_datafile(struct transaction *trans, const char *node)
+{
+	char *dir = node_dir(trans, node);
+
+	return talloc_asprintf(node, "%s/.data", dir);
+}
+
+/* Path of the ".perms" file inside node's directory (talloc child of node). */
+static char *node_permfile(struct transaction *trans, const char *node)
+{
+	char *dir = node_dir(trans, node);
+
+	return talloc_asprintf(node, "%s/.perms", dir);
+}
+
+/* Allocate a buffered_data as a talloc child of ctx, positioned at the
+ * start of the header and with no payload buffer yet. */
+struct buffered_data *new_buffer(void *ctx)
+{
+	struct buffered_data *buf = talloc(ctx, struct buffered_data);
+
+	buf->buffer = NULL;
+	buf->used = 0;
+	buf->inhdr = true;
+
+	return buf;
+}
+
+/* Length, including the nul, of the string starting at offset in data.
+ * Returns 0 when offset is past the used bytes or no nul follows it. */
+unsigned int get_string(const struct buffered_data *data, unsigned int offset)
+{
+	const char *start, *end;
+
+	if (offset >= data->used)
+		return 0;
+
+	start = data->buffer + offset;
+	end = memchr(start, 0, data->used - offset);
+
+	return end ? end - start + 1 : 0;
+}
+
+/* Split the input into its nul-terminated strings.  Up to num pointers are
+ * stored in vec; the return value is the total number of strings found,
+ * which may exceed num. */
+unsigned int get_strings(struct buffered_data *data,
+			 char *vec[], unsigned int num)
+{
+	unsigned int count = 0, pos = 0, len;
+
+	for (len = get_string(data, pos); len != 0;
+	     len = get_string(data, pos)) {
+		if (count < num)
+			vec[count] = data->buffer + pos;
+		count++;
+		pos += len;
+	}
+	return count;
+}
+
+/* Queue a reply of the given type, copying len bytes from data.
+ * Returns "false", meaning "connection is not blocked". */
+bool send_reply(struct connection *conn, enum xsd_sockmsg_type type,
+		const void *data, unsigned int len)
+{
+	struct buffered_data *bdata;
+
+	/* The buffer is allocated as a child of conn and its payload as a
+	 * child of the buffer, so freeing the parent destroys both. */
+	bdata = new_buffer(conn);
+	bdata->buffer = talloc_array(bdata, char, len);
+
+	bdata->hdr.msg.type = type;
+	bdata->hdr.msg.len = len;
+	memcpy(bdata->buffer, data, len);
+
+	/* There might be an event going out now.  Queue behind it. */
+	if (conn->out) {
+		/* Only a watch event may already be in flight, and only one
+		 * reply may wait behind it. */
+		assert(conn->out->hdr.msg.type == XS_WATCH_EVENT);
+		assert(!conn->waiting_reply);
+		conn->waiting_reply = bdata;
+	} else
+		conn->out = bdata;
+	return false;
+}
+
+/* Some routines (write, mkdir, etc) just need a non-error return.
+ * Replies with the string "OK" (nul included) under the request's type. */
+bool send_ack(struct connection *conn, enum xsd_sockmsg_type type)
+{
+	return send_reply(conn, type, "OK", sizeof("OK"));
+}
+
+/* Reply with XS_ERROR carrying the symbolic name of error, looked up in
+ * xsd_errors[].  An error missing from the table is treated as corruption
+ * (corrupt() does not return). */
+bool send_error(struct connection *conn, int error)
+{
+	unsigned int i = 0;
+
+	while (xsd_errors[i].errnum != error) {
+		/* Reached the last entry without a match. */
+		if (i == ARRAY_SIZE(xsd_errors) - 1)
+			corrupt(conn, "Unknown error %i (%s)", error,
+				strerror(error));
+		i++;
+	}
+
+	return send_reply(conn, XS_ERROR, xsd_errors[i].errstring,
+			  strlen(xsd_errors[i].errstring) + 1);
+}
+
+/* True when node consists only of ASCII letters, digits and "-/_@"
+ * (the empty string qualifies). */
+static bool valid_chars(const char *node)
+{
+	/* Nodes can have lots of crap. */
+	static const char allowed[] =
+		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		"abcdefghijklmnopqrstuvwxyz"
+		"0123456789-/_@";
+
+	/* The allowed prefix covers the whole string iff it stops at the nul. */
+	return node[strspn(node, allowed)] == '\0';
+}
+
+/* A valid node name starts with '/', has no trailing '/' (except the root
+ * "/" itself), contains no "//" and uses only valid_chars(). */
+static bool is_valid_nodename(const char *node)
+{
+	size_t len = strlen(node);
+
+	/* Must start in /. */
+	if (node[0] != '/')
+		return false;
+
+	/* Cannot end in / (unless it's just "/"). */
+	if (len > 1 && node[len - 1] == '/')
+		return false;
+
+	/* No double //. */
+	if (strstr(node, "//") != NULL)
+		return false;
+
+	return valid_chars(node);
+}
+
+/* We expect one arg in the input: return NULL otherwise.
+ * "One arg" means the payload is exactly one nul-terminated string. */
+static const char *onearg(struct buffered_data *in)
+{
+	/* An empty payload holds no string at all.  Without this check
+	 * get_string() returns 0, which equals in->used, and the buffer
+	 * would be handed back as if it were a terminated string. */
+	if (in->used == 0 || get_string(in, 0) != in->used)
+		return NULL;
+	return in->buffer;
+}
+
+/* Load the permissions of node from its .perms file.
+ * On success returns an array (talloc child of node) and stores its length
+ * in *num.  If it fails, returns NULL and sets errno.  A permissions file
+ * that cannot be parsed is treated as corruption. */
+static struct xs_permissions *get_perms(struct transaction *transaction,
+					const char *node, unsigned int *num)
+{
+	unsigned int size;
+	char *strings;
+	struct xs_permissions *ret;
+	int *fd;
+
+	fd = talloc_open(node_permfile(transaction, node), O_RDONLY, 0);
+	if (!fd)
+		return NULL;
+	strings = read_all(fd, &size);
+	if (!strings)
+		return NULL;
+
+	/* One permission entry per string in the file. */
+	*num = count_strings(strings, size);
+	ret = talloc_array(node, struct xs_permissions, *num);
+	if (!strings_to_perms(ret, *num, strings))
+		corrupt(NULL, "Permissions corrupt for %s", node);
+
+	return ret;
+}
+
+/* Serialise num permissions into consecutive nul-terminated strings.
+ * Returns the block (talloc child of node) and stores its total length in
+ * *len; returns NULL if perm_to_string() rejects an entry.  With num == 0
+ * the result is NULL and *len is 0. */
+static char *perms_to_strings(const char *node,
+			      struct xs_permissions *perms, unsigned int num,
+			      unsigned int *len)
+{
+	unsigned int i;
+	char *strings = NULL;
+	/* NOTE(review): sized for a domid's digits plus one byte; confirm
+	 * that also covers whatever prefix perm_to_string() writes and the
+	 * terminating nul. */
+	char buffer[MAX_STRLEN(domid_t) + 1];
+
+	for (*len = 0, i = 0; i < num; i++) {
+		if (!perm_to_string(&perms[i], buffer))
+			return NULL;
+
+		/* Grow by this entry plus its nul, then append it. */
+		strings = talloc_realloc(node, strings, char,
+					 *len + strlen(buffer) + 1);
+		strcpy(strings + *len, buffer);
+		*len += strlen(buffer) + 1;
+	}
+	return strings;
+}
+
+/* Remove path from the filesystem, recursing through directories.  Takes
+ * void * and returns int so it can serve as a talloc destructor.
+ * A path that does not exist counts as success; any other failure is
+ * treated as corruption.  Always returns 0. */
+int destroy_path(void *path)
+{
+	struct dirent *entry;
+	DIR *dir = opendir(path);
+
+	if (!dir) {
+		/* Not a directory (or already gone): plain unlink. */
+		if (unlink(path) == 0 || errno == ENOENT)
+			return 0;
+		corrupt(NULL, "Destroying path %s", path);
+	}
+
+	for (entry = readdir(dir); entry != NULL; entry = readdir(dir)) {
+		const char *name = entry->d_name;
+		char fullpath[strlen(path) + 1 + strlen(name) + 1];
+
+		if (streq(name, ".") || streq(name, ".."))
+			continue;
+
+		sprintf(fullpath, "%s/%s", (char *)path, name);
+		destroy_path(fullpath);
+	}
+	closedir(dir);
+
+	if (rmdir(path) != 0)
+		corrupt(NULL, "Destroying directory %s", path);
+	return 0;
+}
+
+/* Create a self-destructing temporary file named "<path>.tmp" holding
+ * contents.  Returns its name (talloc child of path), or NULL if it could
+ * not be created or written.  Once created, the file is removed when the
+ * name is freed, unless commit_tempfile() claims it first. */
+static char *tempfile(const char *path, void *contents, unsigned int len)
+{
+	int *fd;
+	char *tmppath = talloc_asprintf(path, "%s.tmp", path);
+
+	/* O_EXCL: fail rather than reuse an existing temporary. */
+	fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640);
+	if (!fd)
+		return NULL;
+	talloc_set_destructor(tmppath, destroy_path);
+	if (!write_all(*fd, contents, len))
+		return NULL;
+
+	return tmppath;
+}
+
+/* Rename a temporary file to its real name, i.e. path with its final
+ * ".suffix" stripped, and cancel its self-destruct.
+ * We assume rename() doesn't fail on moves in same dir; a failure is
+ * treated as corruption. */
+static void commit_tempfile(const char *path)
+{
+	char realname[strlen(path) + 1];
+	/* Length of path up to, but excluding, the last '.'. */
+	unsigned int len = strrchr(path, '.') - path;
+
+	memcpy(realname, path, len);
+	realname[len] = '\0';
+	if (rename(path, realname) != 0)
+		corrupt(NULL, "Committing %s", realname);
+	/* The file now lives under its real name: don't delete it on free. */
+	talloc_set_destructor(path, NULL);
+}
+
+/* Replace node's permissions file with the given num entries.
+ * Returns false if the permissions cannot be serialised or the temporary
+ * file cannot be written; true once the new file is in place. */
+static bool set_perms(struct transaction *transaction,
+		      const char *node,
+		      struct xs_permissions *perms, unsigned int num)
+{
+	unsigned int len;
+	char *permpath, *strings;
+
+	strings = perms_to_strings(node, perms, num, &len);
+	if (!strings)
+		return false;
+
+	/* Create then move. */
+	permpath = tempfile(node_permfile(transaction, node), strings, len);
+	if (!permpath)
+		return false;
+
+	commit_tempfile(permpath);
+	return true;
+}
+
+/* Return the parent of node as a new string (talloc child of node).
+ * A top-level node, and the root itself, have "/" as parent. */
+static char *get_parent(const char *node)
+{
+	/* Skip the leading '/' so a top-level name finds no slash. */
+	char *slash = strrchr(node + 1, '/');
+	if (!slash)
+		return talloc_strdup(node, "/");
+	/* The "*" precision must be an int; a pointer difference is
+	 * ptrdiff_t, which is wider on LP64 targets. */
+	return talloc_asprintf(node, "%.*s", (int)(slash - node), node);
+}
+
+/* Effective permissions of domain id given a node's permission list.
+ * perms[0] names the owner and carries the default for everyone else;
+ * later entries are per-domain overrides. */
+static enum xs_perm_type perm_for_id(domid_t id,
+				     struct xs_permissions *perms,
+				     unsigned int num)
+{
+	struct xs_permissions *entry;
+
+	/* Owners and tools get it all... */
+	if (id == 0 || id == perms[0].id)
+		return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_CREATE|XS_PERM_OWNER;
+
+	/* ...others get their own entry if one exists... */
+	for (entry = perms + 1; entry < perms + num; entry++)
+		if (entry->id == id)
+			return entry->perms;
+
+	/* ...and the default otherwise. */
+	return perms[0].perms;
+}
+
+/* Permissions here are unusual: a domain can be allowed into a specific
+ * node without being allowed into its parents.  When an operation is going
+ * to fail, though, the errno must not leak anything about the node.
+ *
+ * Walks up from node to the nearest ancestor that has permissions.  If the
+ * connection cannot read that ancestor the answer is EACCES; otherwise
+ * errnum is passed through.  ENOMEM is always passed through. */
+static int check_with_parents(struct connection *conn, const char *node,
+			      int errnum)
+{
+	struct xs_permissions *perms;
+	unsigned int num;
+
+	/* We always tell them about memory failures. */
+	if (errnum == ENOMEM)
+		return errnum;
+
+	for (;;) {
+		node = get_parent(node);
+		perms = get_perms(conn->transaction, node, &num);
+		if (perms || streq(node, "/"))
+			break;
+	}
+
+	/* No permission at root?  We're in trouble. */
+	if (!perms)
+		corrupt(conn, "No permissions file at root");
+
+	if (perm_for_id(conn->id, perms, num) & XS_PERM_READ)
+		return errnum;
+
+	return EACCES;
+}
+
+/* Check that conn may perform perm on node.
+ * Returns true if allowed.  Otherwise returns false with errno set:
+ * EINVAL for a NULL or malformed name, EROFS for a write on a connection
+ * with no write hook, else the value chosen by check_with_parents(). */
+bool check_node_perms(struct connection *conn, const char *node,
+		      enum xs_perm_type perm)
+{
+	struct xs_permissions *perms;
+	unsigned int num;
+
+	/* One test covers both the NULL and the malformed case (the NULL
+	 * check used to be written out twice). */
+	if (!node || !is_valid_nodename(node)) {
+		errno = EINVAL;
+		return false;
+	}
+
+	if (!conn->write && (perm & XS_PERM_WRITE)) {
+		errno = EROFS;
+		return false;
+	}
+
+	perms = get_perms(conn->transaction, node, &num);
+	/* No permissions.  If we want to create it and
+	 * it doesn't exist, check parent directory. */
+	if (!perms && errno == ENOENT && (perm & XS_PERM_CREATE)) {
+		char *parent = get_parent(node);
+		if (!parent)
+			return false;
+
+		perms = get_perms(conn->transaction, parent, &num);
+	}
+	if (!perms) {
+		errno = check_with_parents(conn, node, errno);
+		return false;
+	}
+
+	if (perm_for_id(conn->id, perms, num) & perm)
+		return true;
+
+	errno = check_with_parents(conn, node, EACCES);
+	return false;
+}
+
+/* Reply with the children of node as consecutive nul-terminated names.
+ * Needs read permission on node.  Directory entries containing characters
+ * outside valid_chars() are left out, which hides ".", ".." and the
+ * dot-prefixed bookkeeping files.  Sends an error reply on failure. */
+static bool send_directory(struct connection *conn, const char *node)
+{
+	char *path, *reply;
+	unsigned int reply_len = 0;
+	DIR *dir;
+	struct dirent *dirent;
+
+	if (!check_node_perms(conn, node, XS_PERM_READ))
+		return send_error(conn, errno);
+
+	path = node_dir(conn->transaction, node);
+	dir = opendir(path);
+	if (!dir)
+		return send_error(conn, errno);
+
+	/* Allocate only after validation: node is known non-NULL here, so
+	 * the reply can never land on the NULL talloc context and leak. */
+	reply = talloc_strdup(node, "");
+
+	while ((dirent = readdir(dir)) != NULL) {
+		int len = strlen(dirent->d_name) + 1;
+
+		if (!valid_chars(dirent->d_name))
+			continue;
+
+		reply = talloc_realloc(path, reply, char, reply_len + len);
+		strcpy(reply + reply_len, dirent->d_name);
+		reply_len += len;
+	}
+	closedir(dir);
+
+	return send_reply(conn, XS_DIRECTORY, reply, reply_len);
+}
+
+/* Reply with the contents of node's data file.  Needs read permission.
+ * A node with no data file is reported as EISDIR. */
+static bool do_read(struct connection *conn, const char *node)
+{
+	unsigned int len;
+	char *contents;
+	int *fd;
+
+	if (!check_node_perms(conn, node, XS_PERM_READ))
+		return send_error(conn, errno);
+
+	fd = talloc_open(node_datafile(conn->transaction, node), O_RDONLY, 0);
+	if (fd == NULL) {
+		/* Data file doesn't exist?  We call that a directory */
+		if (errno == ENOENT)
+			errno = EISDIR;
+		return send_error(conn, errno);
+	}
+
+	contents = read_all(fd, &len);
+	if (contents == NULL)
+		return send_error(conn, errno);
+
+	return send_reply(conn, XS_READ, contents, len);
+}
+
+/* Create a new directory.  Optionally put data in it (if data != NULL).
+ * The directory gets a permissions file naming the connection as owner.
+ * Returns false if any step fails; a partly built directory is then
+ * removed by the destructor set below once dir is freed. */
+static bool new_directory(struct connection *conn,
+			  const char *node, void *data, unsigned int datalen)
+{
+	struct xs_permissions perms;
+	char *permstr;
+	unsigned int len;
+	int *fd;
+	char *dir = node_dir(conn->transaction, node);
+
+	if (mkdir(dir, 0750) != 0)
+		return false;
+
+	/* Set destructor so we clean up if necessary. */
+	talloc_set_destructor(dir, destroy_path);
+
+	/* Default permissions: we own it, no one else has permission. */
+	perms.id = conn->id;
+	perms.perms = XS_PERM_NONE;
+
+	permstr = perms_to_strings(dir, &perms, 1, &len);
+	fd = talloc_open(node_permfile(conn->transaction, node),
+			 O_WRONLY|O_CREAT|O_EXCL, 0640);
+	if (!fd || !write_all(*fd, permstr, len))
+		return false;
+
+	if (data) {
+		char *datapath = node_datafile(conn->transaction, node);
+
+		fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640);
+		if (!fd || !write_all(*fd, data, datalen))
+			return false;
+	}
+
+	/* Finished!  Disarm the cleanup so the directory stays. */
+	talloc_set_destructor(dir, NULL);
+	return true;
+}
+
+/* Handle a write request.  The payload is: path, flags, data...
+ * flags is one of XS_WRITE_NONE, XS_WRITE_CREATE or XS_WRITE_CREATE_EXCL.
+ * Returns true only when transaction_block() reports that the connection
+ * must block; otherwise a reply (ack or error) has been queued. */
+static bool do_write(struct connection *conn, struct buffered_data *in)
+{
+	unsigned int offset, datalen;
+	char *vec[2];
+	char *node, *tmppath;
+	enum xs_perm_type mode;
+	struct stat st;
+
+	/* Extra "strings" can be created by binary data. */
+	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
+		return send_error(conn, EINVAL);
+
+	node = vec[0];
+	if (!within_transaction(conn->transaction, node))
+		return send_error(conn, EROFS);
+
+	if (transaction_block(conn, node))
+		return true;
+
+	/* Data starts right after the two strings and their nuls. */
+	offset = strlen(vec[0]) + strlen(vec[1]) + 2;
+	datalen = in->used - offset;
+
+	if (streq(vec[1], XS_WRITE_NONE))
+		mode = XS_PERM_WRITE;
+	else if (streq(vec[1], XS_WRITE_CREATE))
+		mode = XS_PERM_WRITE|XS_PERM_CREATE;
+	/* Same permission bits as CREATE; exclusivity is enforced below. */
+	else if (streq(vec[1], XS_WRITE_CREATE_EXCL))
+		mode = XS_PERM_WRITE|XS_PERM_CREATE;
+	else
+		return send_error(conn, EINVAL);
+
+	if (!check_node_perms(conn, node, mode))
+		return send_error(conn, errno);
+
+	if (lstat(node_dir(conn->transaction, node), &st) != 0) {
+		/* Does not exist... */
+		if (errno != ENOENT)
+			return send_error(conn, errno);
+
+		/* Not going to create it? */
+		if (!(mode & XS_PERM_CREATE))
+			return send_error(conn, ENOENT);
+
+		if (!new_directory(conn, node, in->buffer + offset, datalen))
+			return send_error(conn, errno);
+	} else {
+		/* Exists... */
+		if (streq(vec[1], XS_WRITE_CREATE_EXCL))
+			return send_error(conn, EEXIST);
+
+		/* Write to a temporary and rename it over the data file. */
+		tmppath = tempfile(node_datafile(conn->transaction, node),
+				   in->buffer + offset, datalen);
+		if (!tmppath)
+			return send_error(conn, errno);
+
+		commit_tempfile(tmppath);
+	}
+
+	add_change_node(conn->transaction, node);
+	send_ack(conn, XS_WRITE);
+	fire_watches(conn->transaction, node);
+	return false;
+}
+
+/* XS_MKDIR handler: create @node with no data.
+ * Returns true only when the request is blocked by a transaction
+ * (transaction_block() returned true); otherwise returns false or
+ * whatever send_error() returned. */
+static bool do_mkdir(struct connection *conn, const char *node)
+{
+	/* Creating needs both write and create permission. */
+	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_CREATE))
+		return send_error(conn, errno);
+
+	if (!within_transaction(conn->transaction, node))
+		return send_error(conn, EROFS);
+
+	if (transaction_block(conn, node))
+		return true;
+
+	/* NULL/0: the new node starts without a data file. */
+	if (!new_directory(conn, node, NULL, 0))
+		return send_error(conn, errno);
+
+	add_change_node(conn->transaction, node);
+	send_ack(conn, XS_MKDIR);
+	fire_watches(conn->transaction, node);
+	return false;
+}
+
+/* XS_RM handler: remove @node.  The root node may not be removed. */
+static bool do_rm(struct connection *conn, const char *node)
+{
+	char *doomed, *dir;
+
+	if (!check_node_perms(conn, node, XS_PERM_WRITE))
+		return send_error(conn, errno);
+
+	if (!within_transaction(conn->transaction, node))
+		return send_error(conn, EROFS);
+
+	if (transaction_block(conn, node))
+		return true;
+
+	if (streq(node, "/"))
+		return send_error(conn, EINVAL);
+
+	/* Rename the directory out of the way; destroy_path is attached
+	 * as destructor of the temporary name and does the cleanup. */
+	dir = node_dir(conn->transaction, node);
+	doomed = talloc_asprintf(node, "%s.tmp", dir);
+	talloc_set_destructor(doomed, destroy_path);
+
+	if (rename(dir, doomed) != 0)
+		return send_error(conn, errno);
+
+	add_change_node(conn->transaction, node);
+	send_ack(conn, XS_RM);
+	fire_watches(conn->transaction, node);
+	return false;
+}
+
+/* XS_GET_PERMS handler: reply with @node's permissions as strings. */
+static bool do_get_perms(struct connection *conn, const char *node)
+{
+	struct xs_permissions *list;
+	char *packed;
+	unsigned int packedlen, count;
+
+	/* Reading permissions needs read access to the node. */
+	if (!check_node_perms(conn, node, XS_PERM_READ))
+		return send_error(conn, errno);
+
+	list = get_perms(conn->transaction, node, &count);
+	if (!list)
+		return send_error(conn, errno);
+
+	/* The string form is allocated off the node name. */
+	packed = perms_to_strings(node, list, count, &packedlen);
+	if (!packed)
+		return send_error(conn, errno);
+
+	return send_reply(conn, XS_GET_PERMS, packed, packedlen);
+}
+
+/* XS_SET_PERMS handler.  Payload: node name followed by one or more
+ * permission strings.  Returns true only when blocked by a transaction. */
+static bool do_set_perms(struct connection *conn, struct buffered_data *in)
+{
+	unsigned int num;
+	char *node, *permstr;
+	struct xs_permissions *perms;
+
+	num = count_strings(in->buffer, in->used);
+	if (num < 2)
+		return send_error(conn, EINVAL);
+
+	/* First arg is node name; the permission strings follow it.
+	 * Do not advance in->buffer itself: if we block below, this
+	 * very message is queued and parsed again from the start, and
+	 * in->used still describes the whole buffer. */
+	node = in->buffer;
+	permstr = in->buffer + strlen(in->buffer) + 1;
+	num--;
+
+	if (!within_transaction(conn->transaction, node))
+		return send_error(conn, EROFS);
+
+	if (transaction_block(conn, node))
+		return true;
+
+	/* We must own node to do this (tools can do this too). */
+	if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_OWNER))
+		return send_error(conn, errno);
+
+	/* The array hangs off the node name, so it goes away with "in". */
+	perms = talloc_array(node, struct xs_permissions, num);
+	if (!strings_to_perms(perms, num, permstr))
+		return send_error(conn, errno);
+
+	if (!set_perms(conn->transaction, node, perms, num))
+		return send_error(conn, errno);
+	add_change_node(conn->transaction, node);
+	send_ack(conn, XS_SET_PERMS);
+	fire_watches(conn->transaction, node);
+	return false;
+}
+
+/* Process "in" for conn: "in" will vanish after this conversation, so
+ * we can talloc off it for temporary variables.  May free "conn".
+ * Returns true if can't complete due to block.
+ *
+ * Dispatches on the message type in the header: handlers taking a
+ * single string argument get it from onearg(in), the others parse
+ * "in" themselves.
+ */
+static bool process_message(struct connection *conn, struct buffered_data *in)
+{
+	switch (in->hdr.msg.type) {
+	case XS_DIRECTORY:
+		return send_directory(conn, onearg(in));
+
+	case XS_READ:
+		return do_read(conn, onearg(in));
+
+	case XS_WRITE:
+		return do_write(conn, in);
+
+	case XS_MKDIR:
+		return do_mkdir(conn, onearg(in));
+
+	case XS_RM:
+		return do_rm(conn, onearg(in));
+
+	case XS_GET_PERMS:
+		return do_get_perms(conn, onearg(in));
+
+	case XS_SET_PERMS:
+		return do_set_perms(conn, in);
+
+	case XS_SHUTDOWN:
+		/* Acknowledge, then terminate the whole daemon. */
+		send_ack(conn, XS_SHUTDOWN);
+		/* Everything hangs off auto-free context, freed at exit. */
+		exit(0);
+
+#ifdef TESTING
+	case XS_DEBUG: {
+		/* For testing, we allow them to set id. */
+		if (streq(in->buffer, "setid")) {
+			conn->id = atoi(in->buffer + get_string(in, 0));
+			send_ack(conn, XS_DEBUG);
+		} else if (streq(in->buffer, "failtest")) {
+			/* Optional second string seeds random(). */
+			if (get_string(in, 0) < in->used)
+				srandom(atoi(in->buffer + get_string(in, 0)));
+			send_ack(conn, XS_DEBUG);
+			failtest = true;
+		}
+		/* Any other debug command gets no reply at all. */
+		return false;
+	}
+#endif /* TESTING */
+
+	case XS_WATCH:
+		return do_watch(conn, in);
+
+	case XS_WATCH_ACK:
+		return do_watch_ack(conn);
+
+	case XS_UNWATCH:
+		return do_unwatch(conn, onearg(in));
+
+	case XS_TRANSACTION_START:
+		return do_transaction_start(conn, onearg(in));
+
+	case XS_TRANSACTION_END:
+		return do_transaction_end(conn, onearg(in));
+
+	case XS_INTRODUCE:
+		return do_introduce(conn, in);
+
+	case XS_RELEASE:
+		return do_release(conn, onearg(in));
+
+	case XS_GETDOMAINPATH:
+		return do_get_domain_path(conn, onearg(in));
+
+	/* XS_WATCH_EVENT is rejected like any unknown type. */
+	case XS_WATCH_EVENT:
+	default:
+		eprintf("Client unknown operation %i", in->hdr.msg.type);
+		send_error(conn, ENOSYS);
+		return false;
+	}
+}
+
+/* talloc failure handler: @data is the jmp_buf registered alongside
+ * this handler; jump back to its setjmp() with value 1.  Never returns. */
+static int out_of_mem(void *data)
+{
+	longjmp(*(jmp_buf *)data, 1);
+}
+
+/* Run process_message() on the complete message in conn->in.
+ * The message is detached from the connection first, because processing
+ * may free "conn".  If processing is blocked by a transaction, the
+ * message is put back as conn->in so that it is handled again later.
+ * A talloc allocation failure during processing kills the connection. */
+static void consider_message(struct connection *conn)
+{
+	/* volatile: assigned between setjmp() and a possible longjmp(),
+	 * then read at "end"; without it the value would be
+	 * indeterminate after the jump. */
+	struct buffered_data *volatile in = NULL;
+	enum xsd_sockmsg_type type = conn->in->hdr.msg.type;
+	jmp_buf talloc_fail;
+
+	/* For simplicity, we kill the connection on OOM. */
+	talloc_set_fail_handler(out_of_mem, &talloc_fail);
+	if (setjmp(talloc_fail)) {
+		talloc_free(conn);
+		goto end;
+	}
+
+	if (verbose)
+		xprintf("Got message %i len %i from %p\n",
+			type, conn->in->hdr.msg.len, conn);
+
+	/* We might get a command while waiting for an ack: this means
+	 * the other end discarded it: we will re-transmit. */
+	if (type != XS_WATCH_ACK)
+		reset_watch_event(conn);
+
+	/* Careful: process_message may free connection.  We detach
+	 * "in" beforehand and allocate the new buffer to avoid
+	 * touching conn after process_message.
+	 */
+	in = talloc_steal(talloc_autofree_context(), conn->in);
+	conn->in = new_buffer(conn);
+	if (process_message(conn, in)) {
+		/* Blocked by transaction: queue for re-xmit.  Hand the
+		 * message back to the connection so that it is released
+		 * together with it instead of lingering in the autofree
+		 * context. */
+		talloc_free(conn->in);
+		conn->in = talloc_steal(conn, in);
+		in = NULL;
+	}
+
+end:
+	talloc_free(in);
+	talloc_set_fail_handler(NULL, NULL);
+	/* Leak check: anything besides the autofree context and its
+	 * contents gets reported. */
+	if (talloc_total_blocks(NULL)
+	    != talloc_total_blocks(talloc_autofree_context()) + 1)
+		talloc_report_full(NULL, stderr);
+}
+
+/* Errors in reading or allocating here mean we get out of sync, so we
+ * drop the whole client connection.
+ *
+ * Reads as much as is available of the header or, once the header is
+ * complete, of the body.  When the body is complete the message is
+ * handed to consider_message().  May free "conn". */
+void handle_input(struct connection *conn)
+{
+	int bytes;
+	struct buffered_data *in;
+
+	assert(!conn->blocked);
+	in = conn->in;
+
+	/* Not finished header yet? */
+	if (in->inhdr) {
+		bytes = conn->read(conn, in->hdr.raw + in->used,
+				   sizeof(in->hdr) - in->used);
+		if (bytes <= 0)
+			goto bad_client;
+		in->used += bytes;
+		if (in->used != sizeof(in->hdr))
+			return;
+
+		if (in->hdr.msg.len > PATH_MAX) {
+			/* First argument is a priority: LOG_DAEMON is a
+			 * facility and would log at emergency level. */
+			syslog(LOG_ERR, "Client tried to feed us %i",
+			       in->hdr.msg.len);
+			goto bad_client;
+		}
+
+		in->buffer = talloc_array(in, char, in->hdr.msg.len);
+		if (!in->buffer)
+			goto bad_client;
+		/* From now on "used" counts body bytes. */
+		in->used = 0;
+		in->inhdr = false;
+		return;
+	}
+
+	/* Only read when body bytes are outstanding: a zero-length body
+	 * is already complete, and a 0-byte read request would look
+	 * just like end-of-file. */
+	if (in->used != in->hdr.msg.len) {
+		bytes = conn->read(conn, in->buffer + in->used,
+				   in->hdr.msg.len - in->used);
+		/* 0 means the peer went away mid-message: treat it like
+		 * the header case, or we would be woken for this
+		 * connection forever without making progress. */
+		if (bytes <= 0)
+			goto bad_client;
+
+		in->used += bytes;
+		if (in->used != in->hdr.msg.len)
+			return;
+	}
+
+	consider_message(conn);
+	return;
+
+bad_client:
+	/* Kill it. */
+	talloc_free(conn);
+}
+
+/* Push pending output for @conn; the connection is freed if
+ * write_message() reports failure. */
+void handle_output(struct connection *conn)
+{
+	if (write_message(conn))
+		return;
+
+	talloc_free(conn);
+}
+
+/* Retry the queued message of every connection whose blocking node is
+ * no longer covered by a transaction. */
+static void unblock_connections(void)
+{
+	struct connection *conn, *next;
+
+	list_for_each_entry_safe(conn, next, &connections, list) {
+		/* Skip connections that are not blocked, or still are. */
+		if (!conn->blocked || transaction_covering_node(conn->blocked))
+			continue;
+
+		talloc_free(conn->blocked);
+		conn->blocked = NULL;
+		consider_message(conn);
+	}
+
+	if (list_empty(&connections))
+		return;
+
+	/* To balance bias, rotate the first connection to the back. */
+	conn = list_top(&connections, struct connection, list);
+	list_del(&conn->list);
+	list_add_tail(&conn->list, &connections);
+}
+
+/* Allocate a connection using the given I/O methods (write may be
+ * NULL), give it an empty input buffer and add it to the global list
+ * of connections.  Returns NULL if allocation fails.
+ *
+ * NOTE(review): this resets the talloc fail handler to none on return,
+ * which also drops a handler installed by a caller further up the
+ * stack - confirm that is acceptable for callers such as do_introduce. */
+struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
+{
+	struct connection *new;
+	jmp_buf talloc_fail;
+
+	new = talloc(talloc_autofree_context(), struct connection);
+	if (!new)
+		return NULL;
+
+	/* "blocked" is a node name pointer, not a flag. */
+	new->blocked = NULL;
+	new->out = new->waiting_reply = NULL;
+	new->event = NULL;
+	new->in = NULL;
+	new->fd = -1;
+	new->id = 0;
+	new->domain = NULL;
+	new->transaction = NULL;
+	new->write = write;
+	new->read = read;
+
+	talloc_set_fail_handler(out_of_mem, &talloc_fail);
+	if (setjmp(talloc_fail)) {
+		/* Unregister before returning: talloc_fail lives in this
+		 * stack frame and must not be jumped to afterwards. */
+		talloc_set_fail_handler(NULL, NULL);
+		talloc_free(new);
+		return NULL;
+	}
+	new->in = new_buffer(new);
+	talloc_set_fail_handler(NULL, NULL);
+
+	/* Only a fully built connection is listed and gets a destructor. */
+	list_add_tail(&new->list, &connections);
+	talloc_set_destructor(new, destroy_conn);
+	return new;
+}
+
+/* connwritefn_t for socket connections: write(2) to the connection's fd. */
+static int writefd(struct connection *conn, const void *data, unsigned int len)
+{
+	int fd = conn->fd;
+
+	return write(fd, data, len);
+}
+
+/* connreadfn_t for socket connections: read(2) from the connection's fd. */
+static int readfd(struct connection *conn, void *data, unsigned int len)
+{
+	int fd = conn->fd;
+
+	return read(fd, data, len);
+}
+
+/* Accept one pending client on listening socket @sock.  Connections
+ * accepted with @canwrite false get no write method. */
+static void accept_connection(int sock, bool canwrite)
+{
+	struct connection *conn;
+	int client = accept(sock, NULL, NULL);
+
+	if (client < 0)
+		return;
+
+	conn = new_connection(canwrite ? writefd : NULL, readfd);
+	if (!conn) {
+		/* No connection object: drop the client. */
+		close(client);
+		return;
+	}
+	conn->fd = client;
+}
+
+/* Turn the absolute time in *tv into the span from now until then;
+ * a time already in the past becomes zero. */
+static void time_relative_to_now(struct timeval *tv)
+{
+	struct timeval now;
+
+	gettimeofday(&now, NULL);
+	if (timercmp(&now, tv, >)) {
+		timerclear(tv);
+		return;
+	}
+
+	/* tv >= now here, so the difference cannot go negative. */
+	timersub(tv, &now, tv);
+}
+
+/* Long options; each one is reported by getopt_long() as the given
+ * character. */
+static struct option options[] = { { "no-fork", 0, NULL, 'N' },
+				   { "verbose", 0, NULL, 'V' },
+				   { "output-pid", 0, NULL, 'P' },
+				   { NULL, 0, NULL, 0 } };
+
+/* Daemon entry point: parse options, create and bind the read-write and
+ * read-only sockets, create the store root if it does not exist yet,
+ * open the hypervisor event channel, optionally daemonize, then serve
+ * connections from a select() loop that never returns. */
+int main(int argc, char *argv[])
+{
+	int opt, *sock, *ro_sock, event_fd, max, tmpout;
+	struct sockaddr_un addr;
+	fd_set inset, outset;
+	bool dofork = true;
+	bool outputpid = false;
+
+	/* NOTE(review): the short-option string is "DV" while the cases
+	 * handled are 'N', 'V' and 'P' - confirm this is intended. */
+	while ((opt = getopt_long(argc, argv, "DV", options, NULL)) != -1) {
+		switch (opt) {
+		case 'N':
+			dofork = false;
+			break;
+		case 'V':
+			verbose = true;
+			break;
+		case 'P':
+			outputpid = true;
+			break;
+		}
+	}
+	if (optind != argc)
+		barf("%s: No arguments desired", argv[0]);
+
+	talloc_enable_leak_report_full();
+
+	/* Create sockets for them to listen to. */
+	/* The fds live in talloc'ed ints so destroy_fd can be attached. */
+	sock = talloc(talloc_autofree_context(), int);
+	*sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (*sock < 0)
+		barf_perror("Could not create socket");
+	ro_sock = talloc(talloc_autofree_context(), int);
+	*ro_sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (*ro_sock < 0)
+		barf_perror("Could not create socket");
+	talloc_set_destructor(sock, destroy_fd);
+	talloc_set_destructor(ro_sock, destroy_fd);
+
+	/* Don't kill us with SIGPIPE. */
+	signal(SIGPIPE, SIG_IGN);
+
+	/* FIXME: Be more sophisticated, don't mug running daemon. */
+	unlink(xs_daemon_socket());
+	unlink(xs_daemon_socket_ro());
+
+	/* NOTE(review): strcpy() into sun_path is unchecked - confirm the
+	 * socket paths always fit. */
+	addr.sun_family = AF_UNIX;
+	strcpy(addr.sun_path, xs_daemon_socket());
+	if (bind(*sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
+		barf_perror("Could not bind socket to %s", xs_daemon_socket());
+	strcpy(addr.sun_path, xs_daemon_socket_ro());
+	if (bind(*ro_sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
+		barf_perror("Could not bind socket to %s",
+			    xs_daemon_socket_ro());
+	/* Read-write socket: owner only.  Read-only socket: owner+group. */
+	if (chmod(xs_daemon_socket(), 0600) != 0
+	    || chmod(xs_daemon_socket_ro(), 0660) != 0)
+		barf_perror("Could not chmod sockets");
+
+	if (listen(*sock, 1) != 0
+	    || listen(*ro_sock, 1) != 0)
+		barf_perror("Could not listen on sockets");
+
+	/* If we're the first, create .perms file for root. */
+	if (mkdir(xs_daemon_store(), 0750) == 0) {
+		struct xs_permissions perms;
+		char *root = talloc_strdup(talloc_autofree_context(), "/");
+
+		perms.id = 0;
+		perms.perms = XS_PERM_READ;
+		if (!set_perms(NULL, root, &perms, 1))
+			barf_perror("Could not create permissions in root");
+		talloc_free(root);
+		/* Result of this mkdir is not checked. */
+		mkdir(xs_daemon_transactions(), 0750);
+	} else if (errno != EEXIST)
+		barf_perror("Could not create root %s", xs_daemon_store());
+
+	/* Listen to hypervisor. */
+	event_fd = domain_init();
+
+	/* Debugging: daemonize() closes standard fds, so dup here. */
+	tmpout = dup(STDOUT_FILENO);
+	if (dofork) {
+		openlog("xenstored", 0, LOG_DAEMON);
+		daemonize();
+	}
+
+	/* Printed after daemonize() so that it is the final pid. */
+	if (outputpid) {
+		char buffer[20];
+		sprintf(buffer, "%i\n", getpid());
+		write(tmpout, buffer, strlen(buffer));
+	}
+	close(tmpout);
+
+#ifdef TESTING
+	signal(SIGUSR1, stop_failtest);
+#endif
+
+	/* Get ready to listen to the tools. */
+	max = initialize_set(&inset, &outset, *sock, *ro_sock, event_fd);
+
+	/* Main loop. */
+	for (;;) {
+		struct connection *i;
+		struct timeval *tvp = NULL, tv;
+
+		/* Wait with a timeout only if a transaction timeout is set. */
+		timerclear(&tv);
+		shortest_transaction_timeout(&tv);
+		if (timerisset(&tv)) {
+			time_relative_to_now(&tv);
+			tvp = &tv;
+		}
+
+		if (select(max+1, &inset, &outset, NULL, tvp) < 0) {
+			if (errno == EINTR)
+				continue;
+			barf_perror("Select failed");
+		}
+
+		if (FD_ISSET(*sock, &inset))
+			accept_connection(*sock, true);
+
+		if (FD_ISSET(*ro_sock, &inset))
+			accept_connection(*ro_sock, false);
+
+		if (FD_ISSET(event_fd, &inset))
+			handle_event(event_fd);
+
+		list_for_each_entry(i, &connections, list) {
+			/* Domain connections are served by handle_event. */
+			if (i->domain)
+				continue;
+
+			/* Operations can delete themselves or others
+			 * (xs_release): list is not safe after input,
+			 * so break. */
+			if (FD_ISSET(i->fd, &inset)) {
+				handle_input(i);
+				break;
+			}
+			if (FD_ISSET(i->fd, &outset)) {
+				handle_output(i);
+				break;
+			}
+		}
+
+		if (tvp)
+			check_transaction_timeout();
+
+		/* If transactions ended, we might be able to do more work. */
+		unblock_connections();
+
+		/* Rebuild the fd sets for the next round. */
+		max = initialize_set(&inset, &outset, *sock,*ro_sock,event_fd);
+	}
+}
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
new file mode 100644
index 0000000000..fe6eec8f72
--- /dev/null
+++ b/tools/xenstore/xenstored_core.h
@@ -0,0 +1,123 @@
+/* 
+    Internal interfaces for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#ifndef _XENSTORED_INTERNAL_H
+#define _XENSTORED_INTERNAL_H
+#include <stdbool.h>
+#include <stdint.h>
+#include <errno.h>
+#include "xs_lib.h"
+#include "xenstored.h"
+#include "list.h"
+
+/* One message being received or sent: the fixed-size header (accessible
+ * as a struct or as raw bytes) plus a pointer to the payload. */
+struct buffered_data
+{
+	/* Are we still doing the header? */
+	bool inhdr;
+	/* How far are we? */
+	unsigned int used;
+	union {
+		struct xsd_sockmsg msg;
+		char raw[sizeof(struct xsd_sockmsg)];
+	} hdr;
+	/* The actual data. */
+	char *buffer;
+};
+
+struct connection;
+typedef int connwritefn_t(struct connection *, const void *, unsigned int);
+typedef int connreadfn_t(struct connection *, void *, unsigned int);
+
+/* State of one client of the daemon; all connections are linked
+ * through "list". */
+struct connection
+{
+	struct list_head list;
+
+	/* The file descriptor we came in on. */
+	int fd;
+
+	/* Who am I?  0 for socket connections. */
+	domid_t id;
+
+	/* Are we blocked waiting for a transaction to end?  Contains node. */
+	char *blocked;
+
+	/* Our current event.  If all used, we're waiting for ack. */
+	struct watch_event *event;
+
+	/* Buffered incoming data. */
+	struct buffered_data *in;
+
+	/* Buffered output data */
+	struct buffered_data *out;
+
+	/* If we had a watch fire outgoing when we needed to reply... */
+	struct buffered_data *waiting_reply;
+
+	/* My transaction, if any. */
+	struct transaction *transaction;
+
+	/* The domain I'm associated with, if any. */
+	struct domain *domain;
+
+	/* Methods for communicating over this connection: write can be NULL */
+	connwritefn_t *write;
+	connreadfn_t *read;
+};
+
+/* Return length of string (including nul) at this offset. */
+unsigned int get_string(const struct buffered_data *data,
+			unsigned int offset);
+
+/* Break input into vectors, return the number, fill in up to num of them. */
+unsigned int get_strings(struct buffered_data *data,
+			 char *vec[], unsigned int num);
+
+/* Is child node a child or equal to parent node? */
+bool is_child(const char *child, const char *parent);
+
+/* Create a new buffer with lifetime of context. */
+struct buffered_data *new_buffer(void *ctx);
+
+bool send_reply(struct connection *conn, enum xsd_sockmsg_type type,
+                const void *data, unsigned int len);
+
+/* Some routines (write, mkdir, etc) just need a non-error return */
+bool send_ack(struct connection *conn, enum xsd_sockmsg_type type);
+
+/* Send an error: error is usually "errno". */
+bool send_error(struct connection *conn, int error);
+
+/* Check permissions on this node. */
+bool check_node_perms(struct connection *conn, const char *node,
+		      enum xs_perm_type perm);
+
+/* Path to this node outside transaction. */
+char *node_dir_outside_transaction(const char *node);
+
+/* Fail due to excessive corruption, capitalist pigdogs! */
+void __attribute__((noreturn)) corrupt(struct connection *conn,
+				       const char *fmt, ...);
+
+struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
+
+void handle_input(struct connection *conn);
+void handle_output(struct connection *conn);
+
+/* Convenient talloc-style destructor for paths. */
+int destroy_path(void *path);
+#endif /* _XENSTORED_INTERNAL_H */
diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c
new file mode 100644
index 0000000000..bcc0a64967
--- /dev/null
+++ b/tools/xenstore/xenstored_domain.c
@@ -0,0 +1,387 @@
+/* 
+    Domain communications for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdio.h>
+#include <linux/ioctl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+//#define DEBUG
+#include "utils.h"
+#include "talloc.h"
+#include "xenstored_core.h"
+#include "xenstored_domain.h"
+#include "xenstored_test.h"
+
+static int *xc_handle;
+static int eventchn_fd;
+static unsigned int ringbuf_datasize;
+
+/* A domain introduced to the daemon; linked into the "domains" list. */
+struct domain
+{
+	struct list_head list;
+
+	/* The id of this domain */
+	domid_t domid;
+
+	/* Event channel port */
+	u16 port;
+
+	/* Domain path in store. */
+	char *path;
+
+	/* Shared page. */
+	void *page;
+
+	/* Input and output ringbuffer heads. */
+	/* (input is at the start of the page, output at its middle.) */
+	struct ringbuf_head *input, *output;
+
+	/* The connection associated with this. */
+	struct connection *conn;
+
+};
+
+static LIST_HEAD(domains);
+
+/* Record @conn as the connection associated with @domain. */
+void domain_set_conn(struct domain *domain, struct connection *conn)
+{
+	domain->conn = conn;
+}
+
+/* Head of one ring buffer in the shared page; the data bytes follow
+ * directly in buf[].  Packed, so the layout has no padding. */
+struct ringbuf_head
+{
+	u32 write; /* Next place to write to */
+	u32 read; /* Next place to read from */
+	u8 flags;
+	char buf[0];
+} __attribute__((packed));
+
+#define EVENTCHN_BIND		_IO('E', 2)
+#define EVENTCHN_UNBIND 	_IO('E', 3)
+
+/* Sanity-check a ring head: both indices must lie inside the data area.
+ * FIXME: Mark connection as broken (close it?) when this happens. */
+static bool check_buffer(const struct ringbuf_head *h)
+{
+	if (h->write >= ringbuf_datasize)
+		return false;
+
+	return h->read < ringbuf_datasize;
+}
+
+/* Find the next contiguous run of bytes that may be written: returns
+ * its start inside @buf and stores its length in *len.  The byte just
+ * before the read index is never filled, as a completely full ring
+ * would look like an empty one. */
+static void *get_output_chunk(const struct ringbuf_head *h,
+			      void *buf, u32 *len)
+{
+	/* Slot just before the read index, wrapping at the start. */
+	u32 limit = (h->read == 0) ? ringbuf_datasize - 1 : h->read - 1;
+
+	if (limit >= h->write)
+		/* Unread data lies ahead: stop short of it. */
+		*len = limit - h->write;
+	else
+		/* Free up to the end of the buffer. */
+		*len = ringbuf_datasize - h->write;
+
+	return buf + h->write;
+}
+
+/* Find the next contiguous run of bytes that may be read: returns its
+ * start inside @buf and stores its length in *len. */
+static const void *get_input_chunk(const struct ringbuf_head *h,
+				   const void *buf, u32 *len)
+{
+	if (h->write >= h->read)
+		/* Everything between the read and write indices. */
+		*len = h->write - h->read;
+	else
+		/* Writer has wrapped: data runs to the end of buffer. */
+		*len = ringbuf_datasize - h->read;
+
+	return buf + h->read;
+}
+
+/* Advance the write index by @len, wrapping to 0 when it reaches the
+ * end of the data area exactly. */
+static void update_output_chunk(struct ringbuf_head *h, u32 len)
+{
+	h->write += len;
+	if (h->write == ringbuf_datasize)
+		h->write = 0;
+}
+
+/* Advance the read index by @len, wrapping to 0 when it reaches the
+ * end of the data area exactly. */
+static void update_input_chunk(struct ringbuf_head *h, u32 len)
+{
+	h->read += len;
+	if (h->read == ringbuf_datasize)
+		h->read = 0;
+}
+
+/* True if at least one byte can be read from the ring. */
+static bool buffer_has_input(const struct ringbuf_head *h)
+{
+	u32 avail;
+
+	/* Only the length matters, so no buffer base is needed. */
+	get_input_chunk(h, NULL, &avail);
+	return avail != 0;
+}
+
+/* True if at least one byte can be written to the ring. */
+static bool buffer_has_output_room(const struct ringbuf_head *h)
+{
+	u32 room;
+
+	/* Only the length matters, so no buffer base is needed. */
+	get_output_chunk(h, NULL, &room);
+	return room != 0;
+}
+
+/* connwritefn_t for domain connections: copy up to @len bytes into the
+ * domain's output ring and notify it over the event channel.
+ * Returns the number of bytes written (possibly fewer than @len, or 0),
+ * or -1 with errno set to EIO if the ring head is corrupt. */
+static int writechn(struct connection *conn, const void *data, unsigned int len)
+{
+	u32 avail;
+	void *dest;
+	struct ringbuf_head h;
+
+	/* Must read head once, and before anything else, and verified. */
+	h = *conn->domain->output;
+	mb();
+	if (!check_buffer(&h)) {
+		errno = EIO;
+		return -1;
+	}
+
+	dest = get_output_chunk(&h, conn->domain->output->buf, &avail);
+	if (avail < len)
+		len = avail;
+
+	memcpy(dest, data, len);
+	/* Data must be visible before the index that publishes it. */
+	mb();
+	/* Advance the verified snapshot and store that, rather than
+	 * re-reading the write index from the shared page, which the
+	 * other end may have changed since it was checked. */
+	update_output_chunk(&h, len);
+	conn->domain->output->write = h.write;
+	/* FIXME: Probably not necessary. */
+	mb();
+	xc_evtchn_send(*xc_handle, conn->domain->port);
+	return len;
+}
+
+/* connreadfn_t for domain connections: copy up to @len bytes out of the
+ * domain's input ring.  The domain is notified only if the ring was
+ * full before this read.
+ * Returns the number of bytes read (possibly fewer than @len, or 0),
+ * or -1 with errno set to EIO if the ring head is corrupt. */
+static int readchn(struct connection *conn, void *data, unsigned int len)
+{
+	u32 avail;
+	const void *src;
+	struct ringbuf_head h;
+	bool was_full;
+
+	/* Must read head once, and before anything else, and verified. */
+	h = *conn->domain->input;
+	mb();
+
+	if (!check_buffer(&h)) {
+		errno = EIO;
+		return -1;
+	}
+
+	src = get_input_chunk(&h, conn->domain->input->buf, &avail);
+	if (avail < len)
+		len = avail;
+
+	/* Judged on the snapshot, before our read frees any space. */
+	was_full = !buffer_has_output_room(&h);
+	memcpy(data, src, len);
+	/* Finish copying before the space is handed back. */
+	mb();
+	/* Advance the verified snapshot and store that, rather than
+	 * re-reading the read index from the shared page, which the
+	 * other end may have changed since it was checked. */
+	update_input_chunk(&h, len);
+	conn->domain->input->read = h.read;
+	/* FIXME: Probably not necessary. */
+	mb();
+
+	/* If it was full, tell them we've taken some. */
+	if (was_full)
+		xc_evtchn_send(*xc_handle, conn->domain->port);
+	return len;
+}
+
+/* talloc destructor for a struct domain: unlink it from the domain
+ * list, unbind its event channel port and unmap its shared page.
+ * Always returns 0. */
+static int destroy_domain(void *_domain)
+{
+	struct domain *d = _domain;
+
+	list_del(&d->list);
+
+	if (d->port) {
+		if (ioctl(eventchn_fd, EVENTCHN_UNBIND, d->port) != 0)
+			eprintf("> Unbinding port %i failed!\n", d->port);
+	}
+
+	if (d->page)
+		munmap(d->page, getpagesize());
+
+	return 0;
+}
+
+/* Look up a domain by event channel port; NULL if there is none. */
+static struct domain *find_domain(u16 port)
+{
+	struct domain *d;
+
+	list_for_each_entry(d, &domains, list) {
+		if (d->port != port)
+			continue;
+		return d;
+	}
+	return NULL;
+}
+
+/* Handle a notification on the event channel fd: read the port that
+ * fired, service the matching domain until it has neither input nor
+ * output work left, then (outside TESTING builds) write the port back
+ * to the fd. */
+void handle_event(int event_fd)
+{
+	u16 port;
+	struct domain *domain;
+
+	if (read(event_fd, &port, sizeof(port)) != sizeof(port))
+		barf_perror("Failed to read from event fd");
+
+	/* We have to handle *all* the data available before we ack:
+	 * careful that handle_input/handle_output can destroy conn.
+	 */
+	/* Hence the domain is looked up afresh on every iteration. */
+	while ((domain = find_domain(port)) != NULL) {
+		if (!domain->conn->blocked && buffer_has_input(domain->input))
+			handle_input(domain->conn);
+		else if (domain->conn->out
+			 && buffer_has_output_room(domain->output))
+			handle_output(domain->conn);
+		else
+			break;
+	}
+
+#ifndef TESTING
+	if (write(event_fd, &port, sizeof(port)) != sizeof(port))
+		barf_perror("Failed to write to event fd");
+#endif
+}
+
+/* domid, mfn, evtchn, path
+ *
+ * XS_INTRODUCE handler: map the domain's shared page, bind its event
+ * channel port and create a connection for it.  On any failure the
+ * resources acquired so far are released here and an error is sent. */
+bool do_introduce(struct connection *conn, struct buffered_data *in)
+{
+	struct domain *domain;
+	char *vec[4];
+	int saved_errno;
+
+	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
+		return send_error(conn, EINVAL);
+
+	/* Hang domain off "in" until we're finished.  No destructor
+	 * yet: destroy_domain() unlinks from the list, unbinds the port
+	 * and unmaps the page, none of which is valid before the end of
+	 * this function. */
+	domain = talloc(in, struct domain);
+	domain->domid = atoi(vec[0]);
+	domain->port = atoi(vec[2]);
+	domain->path = talloc_strdup(domain, vec[3]);
+	domain->page = NULL;
+	domain->conn = NULL;
+	if (!domain->port || !domain->domid)
+		return send_error(conn, EINVAL);
+	domain->page = xc_map_foreign_range(*xc_handle, domain->domid,
+					    getpagesize(),
+					    PROT_READ|PROT_WRITE,
+					    atol(vec[1]));
+	if (!domain->page)
+		return send_error(conn, errno);
+
+	/* One in each half of page. */
+	domain->input = domain->page;
+	domain->output = domain->page + getpagesize()/2;
+
+	/* Tell kernel we're interested in this event. */
+	if (ioctl(eventchn_fd, EVENTCHN_BIND, domain->port) != 0) {
+		/* munmap() may overwrite errno. */
+		saved_errno = errno;
+		munmap(domain->page, getpagesize());
+		return send_error(conn, saved_errno);
+	}
+
+	domain->conn = new_connection(writechn, readchn);
+	if (!domain->conn) {
+		/* new_connection() returns NULL when allocation fails. */
+		if (ioctl(eventchn_fd, EVENTCHN_UNBIND, domain->port) != 0)
+			eprintf("> Unbinding port %i failed!\n", domain->port);
+		munmap(domain->page, getpagesize());
+		return send_error(conn, ENOMEM);
+	}
+	domain->conn->domain = domain;
+
+	/* Fully set up: list it, arm the destructor and let the domain
+	 * live and die with its connection. */
+	list_add(&domain->list, &domains);
+	talloc_set_destructor(domain, destroy_domain);
+	talloc_steal(domain->conn, domain);
+
+	return send_ack(conn, XS_INTRODUCE);
+}
+
+/* Look up a domain by domain id; NULL if there is none. */
+static struct domain *find_domain_by_domid(domid_t domid)
+{
+	struct domain *d;
+
+	list_for_each_entry(d, &domains, list) {
+		if (d->domid != domid)
+			continue;
+		return d;
+	}
+	return NULL;
+}
+
+/* domid
+ *
+ * XS_RELEASE handler: free the connection of the given domain. */
+bool do_release(struct connection *conn, const char *domid_str)
+{
+	struct domain *target;
+	domid_t id;
+
+	/* A missing argument and an id of 0 are both invalid. */
+	id = domid_str ? atoi(domid_str) : 0;
+	if (!id)
+		return send_error(conn, EINVAL);
+
+	target = find_domain_by_domid(id);
+	if (!target)
+		return send_error(conn, ENOENT);
+
+	if (!target->conn)
+		return send_error(conn, EINVAL);
+
+	talloc_free(target->conn);
+	return send_ack(conn, XS_RELEASE);
+}
+
+/* XS_GETDOMAINPATH handler: reply with the store path of a domain.
+ * An id of 0 selects the domain of the asking connection. */
+bool do_get_domain_path(struct connection *conn, const char *domid_str)
+{
+	struct domain *target;
+	domid_t id;
+
+	if (!domid_str)
+		return send_error(conn, EINVAL);
+
+	id = atoi(domid_str);
+	target = (id == 0) ? conn->domain : find_domain_by_domid(id);
+	if (!target)
+		return send_error(conn, ENOENT);
+
+	/* The reply includes the terminating nul. */
+	return send_reply(conn, XS_GETDOMAINPATH, target->path,
+			  strlen(target->path) + 1);
+}
+
+/* talloc destructor for the int holding the xc interface handle:
+ * closes the interface.  Always returns 0. */
+static int close_xc_handle(void *_handle)
+{
+	xc_interface_close(*(int *)_handle);
+	return 0;
+}
+
+/* Returns the event channel handle. */
+/* Also computes the ring data size, opens the xc interface and opens
+ * the event channel device; each failure is reported through
+ * barf_perror(). */
+int domain_init(void)
+{
+	/* The size of the ringbuffer: half a page minus head structure. */
+	ringbuf_datasize = getpagesize() / 2 - sizeof(struct ringbuf_head);
+
+	/* The handle lives in a talloc'ed int so that a destructor can
+	 * be attached to it. */
+	xc_handle = talloc(talloc_autofree_context(), int);
+	if (!xc_handle)
+		barf_perror("Failed to allocate domain handle");
+	*xc_handle = xc_interface_open();
+	if (*xc_handle < 0)
+		barf_perror("Failed to open connection to hypervisor");
+	talloc_set_destructor(xc_handle, close_xc_handle);
+
+#ifdef TESTING
+	eventchn_fd = fake_open_eventchn();
+#else
+	eventchn_fd = open("/dev/xen/evtchn", O_RDWR);
+#endif
+	if (eventchn_fd < 0)
+		barf_perror("Failed to open connection to hypervisor");
+	return eventchn_fd;
+}
diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h
new file mode 100644
index 0000000000..20e85a54b5
--- /dev/null
+++ b/tools/xenstore/xenstored_domain.h
@@ -0,0 +1,38 @@
+/* 
+    Domain communications for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#ifndef _XENSTORED_DOMAIN_H
+#define _XENSTORED_DOMAIN_H
+
+void handle_event(int event_fd);
+
+/* domid, mfn, eventchn, path */
+bool do_introduce(struct connection *conn, struct buffered_data *in);
+
+/* domid */
+bool do_release(struct connection *conn, const char *domid_str);
+
+/* domid */
+bool do_get_domain_path(struct connection *conn, const char *domid_str);
+
+/* Returns the event channel handle */
+int domain_init(void);
+
+void domain_set_conn(struct domain *domain, struct connection *conn);
+
+#endif /* _XENSTORED_DOMAIN_H */
diff --git a/tools/xenstore/xenstored_test.h b/tools/xenstore/xenstored_test.h
new file mode 100644
index 0000000000..f173a5ca91
--- /dev/null
+++ b/tools/xenstore/xenstored_test.h
@@ -0,0 +1,37 @@
+/* 
+    Testing replacements for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#ifndef _XENSTORED_TEST_H
+#define _XENSTORED_TEST_H
+
+#ifdef TESTING
+bool test_write_all(int fd, void *contents, unsigned int len);
+#define write_all test_write_all
+
+int test_mkdir(const char *dir, int perms);
+#define mkdir test_mkdir
+
+int fake_open_eventchn(void);
+void fake_block_events(void);
+void fake_ack_event(void);
+
+#define ioctl(a,b,c) 0
+
+#endif
+
+#endif /* _XENSTORED_TEST_H */
diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c
new file mode 100644
index 0000000000..ca37307f8c
--- /dev/null
+++ b/tools/xenstore/xenstored_transaction.c
@@ -0,0 +1,284 @@
+/* 
+    Transaction code for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include "talloc.h"
+#include "list.h"
+#include "xenstored_transaction.h"
+#include "xenstored_watch.h"
+#include "xs_lib.h"
+#include "utils.h"
+#include "xenstored_test.h"
+
+struct changed_node
+{
+	/* The list within this transaction. */
+	struct list_head list;
+
+	/* The name of the node. */
+	char *node;
+};
+
+struct transaction
+{
+	/* Global list of transactions. */
+	struct list_head list;
+
+	/* My owner (conn->transaction == me). */
+	struct connection *conn;
+
+	/* Subtree this transaction covers */
+	char *node;
+
+	/* Base for this transaction. */
+	char *divert;
+
+	/* List of changed nodes. */
+	struct list_head changes;
+
+	/* Someone's waiting: time limit. */
+	struct timeval timeout;
+
+	/* We've timed out. */
+	bool destined_to_fail;
+};
+static LIST_HEAD(transactions);
+
+bool within_transaction(struct transaction *trans, const char *node)
+{
+	/* With no transaction in force every node counts as covered;
+	 * otherwise the node must sit inside the transaction's subtree. */
+	return !trans || is_child(node, trans->node);
+}
+
+/* You are on notice: this transaction is blocking someone. */
+static void start_transaction_timeout(struct transaction *trans)
+{
+	if (timerisset(&trans->timeout))
+		return;
+
+	/* One second timeout. */
+	gettimeofday(&trans->timeout, NULL);
+	trans->timeout.tv_sec += 1;
+}
+
+struct transaction *transaction_covering_node(const char *node)
+{
+	struct transaction *i;
+
+	list_for_each_entry(i, &transactions, list) {
+		if (i->destined_to_fail)
+			continue;
+		if (is_child(i->node, node) || is_child(node, i->node))
+			return i;
+	}
+	return NULL;
+}
+
+bool transaction_block(struct connection *conn, const char *node)
+{
+	struct transaction *trans;
+
+	/* Transactions don't overlap, so we can't be blocked by
+	 * others if we're in one. */
+	if (conn->transaction)
+		return false;
+
+	trans = transaction_covering_node(node);
+	if (trans) {
+		start_transaction_timeout(trans);
+		conn->blocked = talloc_strdup(conn, node);
+		return true;
+	}
+	return false;
+}
+
+/* Callers get a change node (which can fail) and only commit after they've
+ * finished.  This way they don't have to unwind eg. a write. */
+void add_change_node(struct transaction *trans, const char *node)
+{
+	struct changed_node *i;
+
+	if (!trans)
+		return;
+
+	list_for_each_entry(i, &trans->changes, list)
+		if (streq(i->node, node))
+			return;
+
+	i = talloc(trans, struct changed_node);
+	i->node = talloc_strdup(i, node);
+	INIT_LIST_HEAD(&i->list);
+	list_add_tail(&i->list, &trans->changes);
+}
+
+char *node_dir_inside_transaction(struct transaction *trans, const char *node)
+{
+	return talloc_asprintf(node, "%s%s", trans->divert,
+			       node + strlen(trans->node));
+}
+
+void shortest_transaction_timeout(struct timeval *tv)
+{
+	struct transaction *i;
+
+	list_for_each_entry(i, &transactions, list) {
+		if (!timerisset(&i->timeout))
+			continue;
+
+		if (!timerisset(tv) || timercmp(&i->timeout, tv, <))
+			*tv = i->timeout;
+	}
+}	
+
+void check_transaction_timeout(void)
+{
+	struct transaction *i;
+	struct timeval now;
+
+	gettimeofday(&now, NULL);
+
+	list_for_each_entry(i, &transactions, list) {
+		if (!timerisset(&i->timeout))
+			continue;
+
+		if (timercmp(&i->timeout, &now, <))
+			i->destined_to_fail = true;
+	}
+}
+
+/* FIXME: Eliminate all uses of this */
+static bool do_command(const char *cmd)
+{
+	int ret;
+
+	ret = system(cmd);
+	if (ret == -1)
+		return false;
+	if (!WIFEXITED(ret) || WEXITSTATUS(ret) != 0) {
+		errno = EIO;
+		return false;
+	}
+	return true;
+}
+
+static int destroy_transaction(void *_transaction)
+{
+	struct transaction *trans = _transaction;
+
+	list_del(&trans->list);
+	return destroy_path(trans->divert);
+}
+
+bool do_transaction_start(struct connection *conn, const char *node)
+{
+	struct transaction *transaction;
+	char *dir, *cmd;
+
+	if (conn->transaction)
+		return send_error(conn, EBUSY);
+
+	if (!check_node_perms(conn, node, XS_PERM_READ))
+		return send_error(conn, errno);
+
+	if (transaction_block(conn, node))
+		return true;
+
+	dir = node_dir_outside_transaction(node);
+
+	/* Attach transaction to node for autofree until it's complete */
+	transaction = talloc(node, struct transaction);
+	transaction->node = talloc_strdup(transaction, node);
+	transaction->divert = talloc_asprintf(transaction, "%s/%p/", 
+					      xs_daemon_transactions(),
+					      transaction);
+	cmd = talloc_asprintf(node, "cp -a %s %s", dir, transaction->divert);
+	if (!do_command(cmd))
+		corrupt(conn, "Creating transaction %s", transaction->divert);
+
+	talloc_steal(conn, transaction);
+	INIT_LIST_HEAD(&transaction->changes);
+	transaction->conn = conn;
+	timerclear(&transaction->timeout);
+	transaction->destined_to_fail = false;
+	list_add_tail(&transaction->list, &transactions);
+	conn->transaction = transaction;
+	talloc_set_destructor(transaction, destroy_transaction);
+	return send_ack(transaction->conn, XS_TRANSACTION_START);
+}
+
+static bool commit_transaction(struct transaction *trans)
+{
+	char *tmp, *dir;
+	struct changed_node *i;
+
+	/* Move: orig -> .old, repl -> orig.  Cleanup deletes .old. */
+	dir = node_dir_outside_transaction(trans->node);
+	tmp = talloc_asprintf(trans, "%s.old", dir);
+
+	if (rename(dir, tmp) != 0)
+		return false;
+	if (rename(trans->divert, dir) != 0)
+		corrupt(trans->conn, "Failed rename %s to %s",
+			trans->divert, dir);
+
+	trans->divert = tmp;
+
+	/* Fire off the watches for everything that changed. */
+	list_for_each_entry(i, &trans->changes, list)
+		fire_watches(NULL, i->node);
+	return true;
+}
+
+bool do_transaction_end(struct connection *conn, const char *arg)
+{
+	if (!arg || (!streq(arg, "T") && !streq(arg, "F")))
+		return send_error(conn, EINVAL);
+
+	if (!conn->transaction)
+		return send_error(conn, ENOENT);
+
+	if (streq(arg, "T")) {
+		if (conn->transaction->destined_to_fail) {
+			send_error(conn, ETIMEDOUT);
+			goto failed;
+		}
+		if (!commit_transaction(conn->transaction)) {
+			send_error(conn, errno);
+			goto failed;
+		}
+	}
+
+	talloc_free(conn->transaction);
+	conn->transaction = NULL;
+	return send_ack(conn, XS_TRANSACTION_END);
+
+failed:
+	talloc_free(conn->transaction);
+	conn->transaction = NULL;
+	return false;
+}
+
diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h
new file mode 100644
index 0000000000..a21bccad72
--- /dev/null
+++ b/tools/xenstore/xenstored_transaction.h
@@ -0,0 +1,50 @@
+/* 
+    Transaction code for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#ifndef _XENSTORED_TRANSACTION_H
+#define _XENSTORED_TRANSACTION_H
+#include "xenstored_core.h"
+
+struct transaction;
+
+bool do_transaction_start(struct connection *conn, const char *node);
+bool do_transaction_end(struct connection *conn, const char *arg);
+
+/* Is node covered by this transaction? */
+bool within_transaction(struct transaction *trans, const char *node);
+
+/* If a write op on this node is blocked by another connection's transaction,
+ * mark conn, setup transaction timeout and return true.
+ */
+bool transaction_block(struct connection *conn, const char *node);
+
+/* Return transaction which covers this node. */
+struct transaction *transaction_covering_node(const char *node);
+
+/* Return directory of node within transaction t. */
+char *node_dir_inside_transaction(struct transaction *t, const char *node);
+
+/* This node was changed: can fail and longjmp. */
+void add_change_node(struct transaction *trans, const char *node);
+
+/* Get shortest timeout: leave tv unset if none. */
+void shortest_transaction_timeout(struct timeval *tv);
+
+/* Have any transactions timed out yet? */
+void check_transaction_timeout(void);
+#endif /* _XENSTORED_TRANSACTION_H */
diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c
new file mode 100644
index 0000000000..2df83e1a54
--- /dev/null
+++ b/tools/xenstore/xenstored_watch.c
@@ -0,0 +1,279 @@
+/* 
+    Watch code for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include "talloc.h"
+#include "list.h"
+#include "xenstored_watch.h"
+#include "xs_lib.h"
+#include "utils.h"
+#include "xenstored_test.h"
+
+/* We create this if anyone is interested in "node", then we pass it from
+ * watch to watch as each connection acks it.
+ */
+struct watch_event
+{
+	/* The watch we are firing for (watch->events) */
+	struct list_head list;
+
+	/* Watch we are currently attached to. */
+	struct watch *watch;
+
+	struct buffered_data *data;
+};
+
+struct watch
+{
+	struct list_head list;
+	unsigned int priority;
+
+	/* Current outstanding events applying to this watch. */
+	struct list_head events;
+
+	char *node;
+	struct connection *conn;
+};
+static LIST_HEAD(watches);
+
+static void reset_event(struct watch_event *event)
+{
+	event->data->inhdr = true;
+	event->data->used = 0;
+}
+
+/* We received a non-ACK response: re-queue any watch we just sent. */
+void reset_watch_event(struct connection *conn)
+{
+	if (waiting_for_ack(conn))
+		reset_event(conn->event);
+}
+
+/* We're waiting if we have an event and we sent it all. */
+bool waiting_for_ack(struct connection *conn)
+{
+	if (!conn->event)
+		return false;
+
+	if (conn->event->data->inhdr)
+		return false;
+	return conn->event->data->used == conn->event->data->hdr.msg.len;
+}
+
+bool is_watch_event(struct connection *conn, struct buffered_data *out)
+{
+	return (conn->event && out == conn->event->data);
+}
+
+/* Look through our watches: if any of them have an event, queue it. */
+void queue_next_event(struct connection *conn)
+{
+	struct watch *watch;
+
+	/* We had a reply queued already?  Send it. */
+	if (conn->waiting_reply) {
+		conn->out = conn->waiting_reply;
+		conn->waiting_reply = NULL;
+		return;
+	}
+
+	/* If we're waiting for ack, don't queue more. */
+	if (waiting_for_ack(conn))
+		return;
+
+	/* Find a good event to send. */
+	if (!conn->event) {
+		list_for_each_entry(watch, &watches, list) {
+			if (watch->conn != conn)
+				continue;
+
+			conn->event = list_top(&watch->events,
+					       struct watch_event, list);
+			if (conn->event)
+				break;
+		}
+		if (!conn->event)
+			return;
+	}
+
+	conn->out = conn->event->data;
+}
+
+/* Watch on DIR applies to DIR, DIR/FILE, but not DIRLONG. */
+static bool watch_applies(const struct watch *watch, const char *node)
+{
+	return is_child(node, watch->node);
+}
+
+static struct watch *find_watch(const char *node)
+{
+	struct watch *watch;
+
+	list_for_each_entry(watch, &watches, list) {
+		if (watch_applies(watch, node))
+			return watch;
+	}
+	return NULL;
+}
+
+static struct watch *find_next_watch(struct watch *watch, const char *node)
+{
+	list_for_each_entry_continue(watch, &watches, list) {
+		if (watch_applies(watch, node))
+			return watch;
+	}
+	return NULL;
+}
+
+/* FIXME: we fail to fire on out of memory.  Should drop connections. */
+void fire_watches(struct transaction *trans, const char *node)
+{
+	struct watch *watch;
+	struct watch_event *event;
+
+	/* No events inside a transaction: commit fires them with trans == NULL. */
+	if (trans)
+		return;
+
+	watch = find_watch(node);	/* first watch in list order that applies */
+	if (!watch)
+		return;
+
+	/* Build the event: the payload is the node name, NUL included. */
+	event = talloc(talloc_autofree_context(), struct watch_event);
+	event->data = new_buffer(event);
+	event->data->hdr.msg.type = XS_WATCH_EVENT;
+	event->data->hdr.msg.len = strlen(node) + 1;
+	event->data->buffer = talloc_strdup(event->data, node);
+
+	/* Tie to this watch.  NOTE(review): head insert => LIFO order? confirm. */
+	event->watch = watch;
+	list_add(&event->list, &watch->events);
+
+	/* If connection not doing anything, queue this. */
+	if (!watch->conn->out)
+		queue_next_event(watch->conn);
+}
+
+/* We're done with this event: see if anyone else wants it. */
+static void move_event_onwards(struct watch_event *event)
+{
+	list_del(&event->list);
+	reset_event(event);
+
+	/* Remove from this watch, and find next watch to put this on. */
+	event->watch = find_next_watch(event->watch, event->data->buffer);
+	if (!event->watch) {
+		talloc_free(event);
+		return;
+	}
+
+	list_add(&event->list, &event->watch->events);
+
+	/* If connection not doing anything, queue this. */
+	if (!event->watch->conn->out)
+		queue_next_event(event->watch->conn);
+}
+
+static int destroy_watch(void *_watch)
+{
+	struct watch *watch = _watch;
+	struct watch_event *event;
+
+	/* Forget about sending out or waiting for acks for this watch.  */
+	if (watch->conn->event && watch->conn->event->watch == watch)
+		watch->conn->event = NULL;
+
+	/* If we have pending events, pass them on to others. */
+	while ((event = list_top(&watch->events, struct watch_event, list)))
+		move_event_onwards(event);
+
+	/* Remove from global list. */
+	list_del(&watch->list);
+	return 0;
+}
+
+/* We keep watches in priority order. */
+static void insert_watch(struct watch *watch)
+{
+	struct watch *i;
+
+	list_for_each_entry(i, &watches, list) {
+		if (i->priority <= watch->priority) {
+			list_add_tail(&watch->list, &i->list);
+			return;
+		}
+	}
+
+	list_add_tail(&watch->list, &watches);
+}
+
+bool do_watch(struct connection *conn, struct buffered_data *in)
+{
+	struct watch *watch;
+	char *vec[2];
+
+	if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec))
+		return send_error(conn, EINVAL);
+
+	if (!check_node_perms(conn, vec[0], XS_PERM_READ))
+		return send_error(conn, errno);
+
+	watch = talloc(conn, struct watch);
+	watch->node = talloc_strdup(watch, vec[0]);
+	watch->conn = conn;
+	watch->priority = strtoul(vec[1], NULL, 0);
+	INIT_LIST_HEAD(&watch->events);
+
+	insert_watch(watch);
+	talloc_set_destructor(watch, destroy_watch);
+	return send_ack(conn, XS_WATCH);
+}
+
+bool do_watch_ack(struct connection *conn)
+{
+	struct watch_event *event;
+
+	if (!waiting_for_ack(conn))
+		return send_error(conn, ENOENT);
+
+	/* Remove this watch event. */
+	event = conn->event;
+	conn->event = NULL;
+
+	move_event_onwards(event);
+	return send_ack(conn, XS_WATCH_ACK);
+}
+
+bool do_unwatch(struct connection *conn, const char *node)
+{
+	struct watch *watch;
+
+	list_for_each_entry(watch, &watches, list) {
+		if (watch->conn == conn
+		    && streq(watch->node, node)) {
+			talloc_free(watch);
+			return send_ack(conn, XS_UNWATCH);
+		}
+	}
+	return send_error(conn, ENOENT);
+}
diff --git a/tools/xenstore/xenstored_watch.h b/tools/xenstore/xenstored_watch.h
new file mode 100644
index 0000000000..656ce4c36b
--- /dev/null
+++ b/tools/xenstore/xenstored_watch.h
@@ -0,0 +1,42 @@
+/* 
+    Watch code for Xen Store Daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#ifndef _XENSTORED_WATCH_H
+#define _XENSTORED_WATCH_H
+#include "xenstored_core.h"
+
+bool do_watch(struct connection *conn, struct buffered_data *in);
+bool do_watch_ack(struct connection *conn);
+bool do_unwatch(struct connection *conn, const char *node);
+
+/* Is this a watch event message for this connection? */
+bool is_watch_event(struct connection *conn, struct buffered_data *out);
+
+/* Look through our watches: if any of them have an event, queue it. */
+void queue_next_event(struct connection *conn);
+
+/* Is this connection waiting for a watch acknowledgement? */
+bool waiting_for_ack(struct connection *conn);
+
+/* Reset event if we were sending one */
+void reset_watch_event(struct connection *conn);
+
+/* Fire all watches. */
+void fire_watches(struct transaction *trans, const char *node);
+
+#endif /* _XENSTORED_WATCH_H */
diff --git a/tools/xenstore/xs.c b/tools/xenstore/xs.c
new file mode 100644
index 0000000000..d5058abfb3
--- /dev/null
+++ b/tools/xenstore/xs.c
@@ -0,0 +1,551 @@
+/* 
+    Xen Store Daemon interface providing simple tree-like database.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <signal.h>
+#include <stdint.h>
+#include <errno.h>
+#include "xs.h"
+#include "xenstored.h"
+#include "xs_lib.h"
+#include "utils.h"
+
+struct xs_handle
+{
+	int fd;
+};
+
+/* Get the socket from the store daemon handle.
+ */
+int xs_fileno(struct xs_handle *h)
+{
+	return h->fd;
+}
+
+static struct xs_handle *get_socket(const char *connect_to)
+{
+	struct sockaddr_un addr;
+	int sock, saved_errno;
+	struct xs_handle *h;
+
+	if (strlen(connect_to) >= sizeof(addr.sun_path)) {
+		errno = ENAMETOOLONG;	/* would overflow addr.sun_path */
+		return NULL;
+	}
+
+	sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0)
+		return NULL;
+	addr.sun_family = AF_UNIX;
+	strcpy(addr.sun_path, connect_to);	/* fits: length checked above */
+	h = malloc(sizeof(*h));
+	if (h && connect(sock, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
+		h->fd = sock;
+		return h;
+	}
+	saved_errno = errno;	/* from malloc or connect, whichever failed */
+	close(sock);
+	free(h);
+	errno = saved_errno;
+	return NULL;
+}
+
+struct xs_handle *xs_daemon_open(void)
+{
+	return get_socket(xs_daemon_socket());
+}
+
+struct xs_handle *xs_daemon_open_readonly(void)
+{
+	return get_socket(xs_daemon_socket_ro());
+}
+
+void xs_daemon_close(struct xs_handle *h)
+{
+	if (h->fd >= 0)
+		close(h->fd);
+	free(h);
+}
+
+static bool read_all(int fd, void *data, unsigned int len)
+{
+	while (len) {
+		ssize_t done;	/* read() returns ssize_t, not int */
+
+		done = read(fd, data, len);
+		if (done < 0) {
+			if (errno == EINTR)
+				continue;
+			return false;
+		}
+		if (done == 0) {
+			/* It closed fd on us?  EBADF is appropriate. */
+			errno = EBADF;
+			return false;
+		}
+		data = (char *)data + done;	/* ISO C: no void * arithmetic */
+		len -= done;
+	}
+
+	return true;
+}
+
+#ifdef XSTEST
+#define read_all read_all_choice
+#define write_all write_all_choice
+#endif
+
+static int get_error(const char *errorstring)
+{
+	unsigned int i;	/* index into the xsd_errors name/errno table */
+
+	for (i = 0; i < ARRAY_SIZE(xsd_errors); i++)
+		if (streq(errorstring, xsd_errors[i].errstring))
+			return xsd_errors[i].errnum;
+	return EINVAL;	/* name not found in the table */
+}
+
+static void *read_reply(int fd, enum xsd_sockmsg_type *type, unsigned int *len)
+{
+	struct xsd_sockmsg msg;
+	char *ret;
+	int saved_errno;
+
+	if (!read_all(fd, &msg, sizeof(msg)))
+		return NULL;
+
+	/* +1: no malloc(0), payload NUL-terminated; a wrapping len fails. */
+	ret = (msg.len + 1 != 0) ? malloc((size_t)msg.len + 1) : NULL;
+	if (!ret)
+		return NULL;
+	if (!read_all(fd, ret, msg.len)) {
+		saved_errno = errno;
+		free(ret);
+		errno = saved_errno;
+		return NULL;
+	}
+	ret[msg.len] = '\0';	/* terminator is not counted in *len */
+	*type = msg.type;
+	if (len)
+		*len = msg.len;
+	return ret;
+}
+
+/* Send message to xs, get malloc'ed reply.  NULL and set errno on error. */
+static void *xs_talkv(struct xs_handle *h, enum xsd_sockmsg_type type,
+		      const struct iovec *iovec,
+		      unsigned int num_vecs,
+		      unsigned int *len)
+{
+	struct xsd_sockmsg msg;
+	void *ret = NULL;
+	int saved_errno;
+	unsigned int i;
+	struct sigaction ignorepipe, oldact;
+
+	msg.type = type;
+	msg.len = 0;
+	for (i = 0; i < num_vecs; i++)
+		msg.len += iovec[i].iov_len;
+
+	ignorepipe.sa_handler = SIG_IGN;
+	sigemptyset(&ignorepipe.sa_mask);
+	ignorepipe.sa_flags = 0;
+	sigaction(SIGPIPE, &ignorepipe, &oldact);
+
+	if (!write_all(h->fd, &msg, sizeof(msg)))
+		goto fail;
+
+	for (i = 0; i < num_vecs; i++)
+		if (!write_all(h->fd, iovec[i].iov_base, iovec[i].iov_len))
+			goto fail;
+
+	/* Watches can have fired before reply comes: daemon detects
+	 * and re-transmits, so we can ignore this. */
+	do {
+		free(ret);
+		ret = read_reply(h->fd, &msg.type, len);
+		if (!ret)
+			goto fail;
+	} while (msg.type == XS_WATCH_EVENT);
+
+	sigaction(SIGPIPE, &oldact, NULL);
+	if (msg.type == XS_ERROR) {
+		saved_errno = get_error(ret);
+		free(ret);
+		errno = saved_errno;
+		return NULL;
+	}
+
+	assert(msg.type == type);
+	return ret;
+
+fail:
+	/* We're in a bad state, so close fd. */
+	saved_errno = errno;
+	sigaction(SIGPIPE, &oldact, NULL);
+	close(h->fd);
+	h->fd = -1;
+	errno = saved_errno;
+	return NULL;
+}
+
+/* free(), but don't change errno. */
+static void free_no_errno(void *p)
+{
+	int saved_errno = errno;
+	free(p);
+	errno = saved_errno;
+}
+
+/* Simplified version of xs_talkv: single message. */
+static void *xs_single(struct xs_handle *h, enum xsd_sockmsg_type type,
+		       const char *string, unsigned int *len)
+{
+	struct iovec iovec;
+
+	iovec.iov_base = (void *)string;
+	iovec.iov_len = strlen(string) + 1;
+	return xs_talkv(h, type, &iovec, 1, len);
+}
+
+static bool xs_bool(char *reply)
+{
+	if (!reply)
+		return false;
+	free(reply);
+	return true;
+}
+
+char **xs_directory(struct xs_handle *h, const char *path, unsigned int *num)
+{
+	char *strings, *p, **ret;
+	unsigned int len;
+
+	strings = xs_single(h, XS_DIRECTORY, path, &len);
+	if (!strings)
+		return NULL;
+
+	/* Count the strings. */
+	*num = count_strings(strings, len);
+
+	/* Transfer to one big alloc for easy freeing. */
+	ret = malloc(*num * sizeof(char *) + len);
+	if (!ret) {
+		free_no_errno(strings);
+		return NULL;
+	}
+	memcpy(&ret[*num], strings, len);
+	free_no_errno(strings);
+
+	strings = (char *)&ret[*num];
+	for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
+		ret[(*num)++] = p;
+	return ret;
+}
+
+/* Get the value of a single file.
+ * Returns a malloced value: call free() on it after use.
+ * len indicates length in bytes.
+ */
+void *xs_read(struct xs_handle *h, const char *path, unsigned int *len)
+{
+	return xs_single(h, XS_READ, path, len);
+}
+
+/* Write the value of a single file.
+ * Returns false on failure.  createflags can be 0, O_CREAT, or O_CREAT|O_EXCL.
+ */
+bool xs_write(struct xs_handle *h, const char *path,
+	      const void *data, unsigned int len, int createflags)
+{
+	const char *flags;
+	struct iovec iovec[3];
+
+	/* Format: Flags (as string), path, data. */
+	if (createflags == 0)
+		flags = XS_WRITE_NONE;
+	else if (createflags == O_CREAT)
+		flags = XS_WRITE_CREATE;
+	else if (createflags == (O_CREAT|O_EXCL))
+		flags = XS_WRITE_CREATE_EXCL;
+	else {
+		errno = EINVAL;
+		return false;
+	}
+
+	iovec[0].iov_base = (void *)path;
+	iovec[0].iov_len = strlen(path) + 1;
+	iovec[1].iov_base = (void *)flags;
+	iovec[1].iov_len = strlen(flags) + 1;
+	iovec[2].iov_base = (void *)data;
+	iovec[2].iov_len = len;
+
+	return xs_bool(xs_talkv(h, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL));
+}
+
+/* Create a new directory.
+ * Returns false on failure.
+ */
+bool xs_mkdir(struct xs_handle *h, const char *path)
+{
+	return xs_bool(xs_single(h, XS_MKDIR, path, NULL));
+}
+
+/* Destroy a file or directory (directories must be empty).
+ * Returns false on failure.
+ */
+bool xs_rm(struct xs_handle *h, const char *path)
+{
+	return xs_bool(xs_single(h, XS_RM, path, NULL));
+}
+
+/* Get permissions of node (first element is owner).
+ * Returns malloced array, or NULL: call free() after use.
+ */
+struct xs_permissions *xs_get_permissions(struct xs_handle *h,
+					  const char *path,
+					  unsigned int *num)
+{
+	char *strings;
+	unsigned int len;
+	struct xs_permissions *ret;
+
+	strings = xs_single(h, XS_GET_PERMS, path, &len);
+	if (!strings)
+		return NULL;
+
+	/* Count the strings: each one perms then domid. */
+	*num = count_strings(strings, len);
+
+	/* One array element for each permission string in the reply. */
+	ret = malloc(*num * sizeof(struct xs_permissions));
+	if (!ret) {
+		free_no_errno(strings);
+		return NULL;
+	}
+
+	if (!strings_to_perms(ret, *num, strings)) {
+		free_no_errno(ret);
+		ret = NULL;
+	}
+	/* On the failure path errno must survive freeing the reply too. */
+	free_no_errno(strings);
+	return ret;
+}
+
+/* Set permissions of node (must be owner).
+ * Returns false on failure.
+ */
+bool xs_set_permissions(struct xs_handle *h, const char *path,
+			struct xs_permissions *perms,
+			unsigned int num_perms)
+{
+	unsigned int i;
+	struct iovec iov[1+num_perms];
+
+	iov[0].iov_base = (void *)path;
+	iov[0].iov_len = strlen(path) + 1;
+	
+	for (i = 0; i < num_perms; i++) {
+		char buffer[MAX_STRLEN(domid_t)+1];
+
+		if (!perm_to_string(&perms[i], buffer))
+			goto unwind;
+
+		iov[i+1].iov_base = strdup(buffer);
+		iov[i+1].iov_len = strlen(buffer) + 1;
+		if (!iov[i+1].iov_base)
+			goto unwind;
+	}
+
+	if (!xs_bool(xs_talkv(h, XS_SET_PERMS, iov, 1+num_perms, NULL)))
+		goto unwind;
+	for (i = 0; i < num_perms; i++)
+		free(iov[i+1].iov_base);
+	return true;
+
+unwind:
+	num_perms = i;
+	for (i = 0; i < num_perms; i++)
+		free_no_errno(iov[i+1].iov_base);
+	return false;
+}
+
+/* Watch a node for changes (poll on fd to detect, or call xs_read_watch()).
+ * When the node (or any child) changes, fd will become readable.
+ * Priority indicates order if multiple watchers: higher is first.
+ * Returns false on failure.
+ */
+bool xs_watch(struct xs_handle *h, const char *path, unsigned int priority)
+{
+	char prio[MAX_STRLEN(priority)];
+	struct iovec iov[2];
+
+	sprintf(prio, "%u", priority);
+	iov[0].iov_base = (void *)path;
+	iov[0].iov_len = strlen(path) + 1;
+	iov[1].iov_base = prio;
+	iov[1].iov_len = strlen(prio) + 1;
+
+	return xs_bool(xs_talkv(h, XS_WATCH, iov, ARRAY_SIZE(iov), NULL));
+}
+
+/* Find out what node change was on (will block if nothing pending).
+ * Returns malloced path, or NULL: call free() after use.
+ */
+char *xs_read_watch(struct xs_handle *h)
+{
+	struct xsd_sockmsg msg;
+	char *path;
+
+	if (!read_all(h->fd, &msg, sizeof(msg)))
+		return NULL;
+
+	assert(msg.type == XS_WATCH_EVENT);
+	path = malloc(msg.len);
+	if (!path)
+		return NULL;
+
+	if (!read_all(h->fd, path, msg.len)) {
+		free_no_errno(path);
+		return NULL;
+	}
+	return path;
+}
+
+/* Acknowledge watch on node.  Watches must be acknowledged before
+ * any other watches can be read.
+ * Returns false on failure.
+ */
+bool xs_acknowledge_watch(struct xs_handle *h)
+{
+	return xs_bool(xs_single(h, XS_WATCH_ACK, "OK", NULL));
+}
+
+/* Remove a watch on a node.
+ * Returns false on failure (no watch on that node).
+ */
+bool xs_unwatch(struct xs_handle *h, const char *path)
+{
+	return xs_bool(xs_single(h, XS_UNWATCH, path, NULL));
+}
+
+/* Start a transaction: changes by others will not be seen during this
+ * transaction, and changes will not be visible to others until end.
+ * Transaction only applies to the given subtree.
+ * You can only have one transaction at any time.
+ * Returns false on failure.
+ */
+bool xs_transaction_start(struct xs_handle *h, const char *subtree)
+{
+	return xs_bool(xs_single(h, XS_TRANSACTION_START, subtree, NULL));
+}
+
+/* End a transaction.
+ * If abort is true, transaction is discarded instead of committed.
+ * Returns false on failure, which indicates an error: transactions will
+ * not fail spuriously.
+ */
+bool xs_transaction_end(struct xs_handle *h, bool abort)
+{
+	/* One-character verdict sent to the daemon: "F" throws the
+	 * transaction away, "T" asks for it to be committed.  The
+	 * terminating NUL is sent too (xs_single adds it to the length). */
+	const char *verdict = abort ? "F" : "T";
+
+	/* Any reply other than an error counts as success. */
+	return xs_bool(xs_single(h, XS_TRANSACTION_END, verdict, NULL));
+}
+
+/* Introduce a new domain.
+ * This tells the store daemon about a shared memory page and event channel
+ * associated with a domain: the domain uses these to communicate.
+ */
+bool xs_introduce_domain(struct xs_handle *h,
+			 domid_t domid,
+			 unsigned long mfn,
+			 unsigned int eventchn,
+			 const char *path)
+{
+	char domid_str[MAX_STRLEN(domid)];
+	char mfn_str[MAX_STRLEN(mfn)];
+	char eventchn_str[MAX_STRLEN(eventchn)];
+	struct iovec iov[4];
+
+	sprintf(domid_str, "%u", domid);
+	sprintf(mfn_str, "%lu", mfn);
+	sprintf(eventchn_str, "%u", eventchn);
+
+	iov[0].iov_base = domid_str;
+	iov[0].iov_len = strlen(domid_str) + 1;
+	iov[1].iov_base = mfn_str;
+	iov[1].iov_len = strlen(mfn_str) + 1;
+	iov[2].iov_base = eventchn_str;
+	iov[2].iov_len = strlen(eventchn_str) + 1;
+	iov[3].iov_base = (char *)path;
+	iov[3].iov_len = strlen(path) + 1;
+
+	return xs_bool(xs_talkv(h, XS_INTRODUCE, iov, ARRAY_SIZE(iov), NULL));
+}
+
+bool xs_release_domain(struct xs_handle *h,
+		       domid_t domid)
+{
+	char domid_str[MAX_STRLEN(domid)];
+
+	sprintf(domid_str, "%u", domid);
+
+	return xs_bool(xs_single(h, XS_RELEASE, domid_str, NULL));
+}
+
+bool xs_shutdown(struct xs_handle *h)
+{
+	bool ret = xs_bool(xs_single(h, XS_SHUTDOWN, "", NULL));
+	if (ret) {
+		char c;
+		/* Wait for it to actually shutdown. */
+		read(h->fd, &c, 1);
+	}
+	return ret;
+}
+
+/* Only useful for DEBUG versions */
+char *xs_debug_command(struct xs_handle *h, const char *cmd,
+		       void *data, unsigned int len)
+{
+	struct iovec iov[2];
+
+	iov[0].iov_base = (void *)cmd;
+	iov[0].iov_len = strlen(cmd) + 1;
+	iov[1].iov_base = data;
+	iov[1].iov_len = len;
+
+	return xs_talkv(h, XS_DEBUG, iov, ARRAY_SIZE(iov), NULL);
+}
diff --git a/tools/xenstore/xs.h b/tools/xenstore/xs.h
new file mode 100644
index 0000000000..ff9481c3a6
--- /dev/null
+++ b/tools/xenstore/xs.h
@@ -0,0 +1,146 @@
+#ifndef _XS_H
+#define _XS_H
+/* 
+    Xen Store Daemon providing simple tree-like database.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+/* On failure, these routines set errno. */
+#include "xs_lib.h"
+
+struct xs_handle;
+
+/* Connect to the xs daemon.
+ * Returns a handle or NULL.
+ */
+struct xs_handle *xs_daemon_open(void);
+
+/* Connect to the xs daemon (readonly for non-root clients).
+ * Returns a handle or NULL.
+ */
+struct xs_handle *xs_daemon_open_readonly(void);
+
+/* Close the connection to the xs daemon. */
+void xs_daemon_close(struct xs_handle *);
+
+/* Get contents of a directory.
+ * Returns a malloced array: call free() on it after use.
+ * Num indicates size.
+ */
+char **xs_directory(struct xs_handle *h, const char *path, unsigned int *num);
+
+/* Get the value of a single file.
+ * Returns a malloced value: call free() on it after use.
+ * len indicates length in bytes.
+ */
+void *xs_read(struct xs_handle *h, const char *path, unsigned int *len);
+
+/* Write the value of a single file.
+ * Returns false on failure.  createflags can be 0, O_CREAT, or O_CREAT|O_EXCL.
+ */
+bool xs_write(struct xs_handle *h, const char *path, const void *data, unsigned int len,
+	      int createflags);
+
+/* Create a new directory.
+ * Returns false on failure.
+ */
+bool xs_mkdir(struct xs_handle *h, const char *path);
+
+/* Destroy a file or directory (and children).
+ * Returns false on failure.
+ */
+bool xs_rm(struct xs_handle *h, const char *path);
+
+/* Get permissions of node (first element is owner, first perms is "other").
+ * Returns malloced array, or NULL: call free() after use.
+ */
+struct xs_permissions *xs_get_permissions(struct xs_handle *h,
+					  const char *path,
+					  unsigned int *num);
+
+/* Set permissions of node (must be owner).
+ * Returns false on failure.
+ */
+bool xs_set_permissions(struct xs_handle *h,
+			const char *path,
+			struct xs_permissions *perms,
+			unsigned int num_perms);
+
+/* Watch a node for changes (poll on fd to detect, or call xs_read_watch()).
+ * When the node (or any child) changes, fd will become readable.
+ * Priority indicates order if multiple watchers: higher is first.
+ * Returns false on failure.
+ */
+bool xs_watch(struct xs_handle *h, const char *path, unsigned int priority);
+
+/* Return the FD to poll on to see if a watch has fired. */
+int xs_fileno(struct xs_handle *h);
+
+/* Find out what node change was on (will block if nothing pending).
+ * Returns malloced path, or NULL: call free() after use.
+ */
+char *xs_read_watch(struct xs_handle *h);
+
+/* Acknowledge watch on node.  Watches must be acknowledged before
+ * any other watches can be read.
+ * Returns false on failure.
+ */
+bool xs_acknowledge_watch(struct xs_handle *h);
+
+/* Remove a watch on a node.
+ * Returns false on failure (no watch on that node).
+ */
+bool xs_unwatch(struct xs_handle *h, const char *path);
+
+/* Start a transaction: changes by others will not be seen during this
+ * transaction, and changes will not be visible to others until end.
+ * Transaction only applies to the given subtree.
+ * You can only have one transaction at any time.
+ * Returns false on failure.
+ */
+bool xs_transaction_start(struct xs_handle *h, const char *subtree);
+
+/* End a transaction.
+ * If abort is true, transaction is discarded instead of committed.
+ * Returns false on failure, which indicates an error: transactions will
+ * not fail spuriously.
+ */
+bool xs_transaction_end(struct xs_handle *h, bool abort);
+
+/* Introduce a new domain.
+ * This tells the store daemon about a shared memory page, event channel
+ * and store path associated with a domain: the domain uses these to communicate.
+ */
+bool xs_introduce_domain(struct xs_handle *h,
+                         domid_t domid,
+                         unsigned long mfn,
+                         unsigned int eventchn,
+                         const char *path);
+
+/* Release a domain.
+ * Tells the store daemon to release the memory page to the domain.
+ */
+bool xs_release_domain(struct xs_handle *h, domid_t domid);
+
+/* Only useful for DEBUG versions */
+char *xs_debug_command(struct xs_handle *h, const char *cmd,
+		       void *data, unsigned int len);
+
+/* Shut down the daemon. */
+bool xs_shutdown(struct xs_handle *h);
+
+#endif /* _XS_H */
diff --git a/tools/xenstore/xs_lib.c b/tools/xenstore/xs_lib.c
new file mode 100644
index 0000000000..8630eaffce
--- /dev/null
+++ b/tools/xenstore/xs_lib.c
@@ -0,0 +1,141 @@
+#include "xs_lib.h"
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+
+/* Common routines for the Xen store daemon and client library. */
+
+/* Root directory for daemon data: $XENSTORED_ROOTDIR if set, else the
+ * built-in default. */
+static const char *xs_daemon_rootdir(void)
+{
+	const char *override = getenv("XENSTORED_ROOTDIR");
+
+	if (override)
+		return override;
+	return "/var/lib/xenstored";
+}
+
+/* Runtime directory for the daemon: $XENSTORED_RUNDIR if set, else the
+ * built-in default. */
+static const char *xs_daemon_rundir(void)
+{
+	const char *override = getenv("XENSTORED_RUNDIR");
+
+	if (override)
+		return override;
+	return "/var/run/xenstored";
+}
+
+/* Path of the daemon socket: "<rundir>/socket".
+ * Returns a pointer to a static buffer (overwritten by the next call), or
+ * NULL with errno = ENAMETOOLONG if the path does not fit in PATH_MAX. */
+const char *xs_daemon_socket(void)
+{
+	static char buf[PATH_MAX];
+	int n = snprintf(buf, sizeof(buf), "%s/socket", xs_daemon_rundir());
+
+	/* The directory can come from the environment, so its length is not
+	 * bounded: refuse rather than overflow or silently truncate. */
+	if (n < 0 || (size_t)n >= sizeof(buf)) {
+		errno = ENAMETOOLONG;
+		return NULL;
+	}
+	return buf;
+}
+
+/* Path of the daemon's "socket_ro" socket: "<rundir>/socket_ro".
+ * Returns a pointer to a static buffer (overwritten by the next call), or
+ * NULL with errno = ENAMETOOLONG if the path does not fit in PATH_MAX. */
+const char *xs_daemon_socket_ro(void)
+{
+	static char buf[PATH_MAX];
+	int n = snprintf(buf, sizeof(buf), "%s/socket_ro", xs_daemon_rundir());
+
+	/* The directory can come from the environment, so its length is not
+	 * bounded: refuse rather than overflow or silently truncate. */
+	if (n < 0 || (size_t)n >= sizeof(buf)) {
+		errno = ENAMETOOLONG;
+		return NULL;
+	}
+	return buf;
+}
+
+/* Path of the store directory: "<rootdir>/store".
+ * Returns a pointer to a static buffer (overwritten by the next call), or
+ * NULL with errno = ENAMETOOLONG if the path does not fit in PATH_MAX. */
+const char *xs_daemon_store(void)
+{
+	static char buf[PATH_MAX];
+	int n = snprintf(buf, sizeof(buf), "%s/store", xs_daemon_rootdir());
+
+	/* The directory can come from the environment, so its length is not
+	 * bounded: refuse rather than overflow or silently truncate. */
+	if (n < 0 || (size_t)n >= sizeof(buf)) {
+		errno = ENAMETOOLONG;
+		return NULL;
+	}
+	return buf;
+}
+
+/* Path of the transactions directory: "<rootdir>/transactions".
+ * Returns a pointer to a static buffer (overwritten by the next call), or
+ * NULL with errno = ENAMETOOLONG if the path does not fit in PATH_MAX. */
+const char *xs_daemon_transactions(void)
+{
+	static char buf[PATH_MAX];
+	int n = snprintf(buf, sizeof(buf), "%s/transactions",
+			 xs_daemon_rootdir());
+
+	/* The directory can come from the environment, so its length is not
+	 * bounded: refuse rather than overflow or silently truncate. */
+	if (n < 0 || (size_t)n >= sizeof(buf)) {
+		errno = ENAMETOOLONG;
+		return NULL;
+	}
+	return buf;
+}
+
+/* Simple routines for writing to sockets, etc. */
+/* Write all len bytes of data to fd, looping over short writes.
+ * A write interrupted by a signal (EINTR) is retried.
+ * Returns true once everything is written; false if write() fails for
+ * any other reason or makes no progress (errno is left as write() set it). */
+bool write_all(int fd, const void *data, unsigned int len)
+{
+	/* Walk the buffer with a byte pointer: arithmetic on void * is a
+	 * GNU extension, not standard C. */
+	const char *p = data;
+
+	while (len) {
+		ssize_t done = write(fd, p, len);
+
+		if (done < 0 && errno == EINTR)
+			continue;
+		if (done <= 0)
+			return false;
+		p += done;
+		len -= done;
+	}
+
+	return true;
+}
+
+/* Parse num consecutive nul-terminated permission strings into perms[].
+ * Each string is one letter -- 'r' (read), 'w' (write), 'b' (both) or
+ * 'n' (none) -- immediately followed by a numeric id.
+ * Returns false with errno = EINVAL on a malformed entry. */
+bool strings_to_perms(struct xs_permissions *perms, unsigned int num,
+		      const char *strings)
+{
+	const char *cursor = strings;
+	unsigned int n;
+
+	for (n = 0; n < num; n++) {
+		char *rest;
+		char letter = *cursor++;
+
+		if (letter == 'r')
+			perms[n].perms = XS_PERM_READ;
+		else if (letter == 'w')
+			perms[n].perms = XS_PERM_WRITE;
+		else if (letter == 'b')
+			perms[n].perms = XS_PERM_READ|XS_PERM_WRITE;
+		else if (letter == 'n')
+			perms[n].perms = XS_PERM_NONE;
+		else {
+			errno = EINVAL;
+			return false;
+		}
+
+		/* The id must be non-empty and run up to the terminator. */
+		perms[n].id = strtol(cursor, &rest, 0);
+		if (*rest != '\0' || *cursor == '\0') {
+			errno = EINVAL;
+			return false;
+		}
+		/* Step over the nul to the start of the next string. */
+		cursor = rest + 1;
+	}
+	return true;
+}
+
+/* Render one permission entry into buffer: a letter ('w', 'r', 'b' for
+ * read+write, 'n' for none) followed by the id in decimal.
+ * The result is up to MAX_STRLEN(domid_t)+1 bytes long.
+ * Returns false with errno = EINVAL for any other perms value, in which
+ * case buffer is left untouched. */
+bool perm_to_string(const struct xs_permissions *perm, char *buffer)
+{
+	char letter;
+
+	if (perm->perms == XS_PERM_WRITE)
+		letter = 'w';
+	else if (perm->perms == XS_PERM_READ)
+		letter = 'r';
+	else if (perm->perms == (XS_PERM_READ|XS_PERM_WRITE))
+		letter = 'b';
+	else if (perm->perms == XS_PERM_NONE)
+		letter = 'n';
+	else {
+		errno = EINVAL;
+		return false;
+	}
+
+	buffer[0] = letter;
+	sprintf(buffer + 1, "%i", (int)perm->id);
+	return true;
+}
+
+/* Given a string and a length, count how many strings (nul terms).
+ * Only the first len bytes of strings are examined; trailing bytes with
+ * no terminator are not counted as a string. */
+unsigned int count_strings(const char *strings, unsigned int len)
+{
+	unsigned int num = 0;
+	const char *p;
+
+	/* Count terminators rather than stepping by strlen(): strlen() would
+	 * run off the end of a buffer whose last string is unterminated. */
+	for (p = strings; p < strings + len; p++)
+		if (*p == '\0')
+			num++;
+
+	return num;
+}
+	
diff --git a/tools/xenstore/xs_lib.h b/tools/xenstore/xs_lib.h
new file mode 100644
index 0000000000..a946ab0b19
--- /dev/null
+++ b/tools/xenstore/xs_lib.h
@@ -0,0 +1,63 @@
+#ifndef _XS_LIB_H
+#define _XS_LIB_H
+/* 
+    Common routines between Xen store user library and daemon.
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <stdbool.h>
+#include <limits.h>
+#include <xc.h>
+
+/* Bitmask of permissions.
+ * READ and WRITE may be or-ed together; the conversion helpers declared
+ * below accept only NONE, READ, WRITE and READ|WRITE. */
+enum xs_perm_type {
+	XS_PERM_NONE = 0,
+	XS_PERM_READ = 1,
+	XS_PERM_WRITE = 2,
+	/* Internal use. */
+	XS_PERM_CREATE = 4,
+	XS_PERM_OWNER = 8,
+};
+
+/* One permission entry: an id paired with the access granted to it. */
+struct xs_permissions
+{
+	domid_t id;
+	enum xs_perm_type perms;
+};
+
+/* Each 10 bits takes ~ 3 digits, plus one, plus one for nul terminator.
+ * x is only used as an operand of sizeof, so it is never evaluated.
+ * NOTE(review): no byte is reserved for a '-' sign, so a 4-byte signed
+ * value such as INT_MIN needs one byte more than this yields -- confirm
+ * before using it to size buffers for signed types. */
+#define MAX_STRLEN(x) ((sizeof(x) * CHAR_BIT + CHAR_BIT-1) / 10 * 3 + 2)
+
+/* Path for various daemon things: env vars can override. */
+const char *xs_daemon_socket(void);
+const char *xs_daemon_socket_ro(void);
+const char *xs_daemon_store(void);
+const char *xs_daemon_transactions(void);
+
+/* Simple write function: loops for you. */
+bool write_all(int fd, const void *data, unsigned int len);
+
+/* Convert strings to permissions.  False if a problem. */
+bool strings_to_perms(struct xs_permissions *perms, unsigned int num,
+		      const char *strings);
+
+/* Convert permissions to a string (up to len MAX_STRLEN(domid_t)+1). */
+bool perm_to_string(const struct xs_permissions *perm, char *buffer);
+
+/* Given a string and a length, count how many strings (nul terms). */
+unsigned int count_strings(const char *strings, unsigned int len);
+
+#endif /* _XS_LIB_H */
diff --git a/tools/xenstore/xs_random.c b/tools/xenstore/xs_random.c
new file mode 100644
index 0000000000..ef5d44d0b0
--- /dev/null
+++ b/tools/xenstore/xs_random.c
@@ -0,0 +1,1646 @@
+/* Random tests.
+
+   We check that the results from a real filesystem are the same.
+*/
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include "xs.h"
+#include "talloc.h"
+#include "utils.h"
+
+/* One backend under test: a label plus a table of operations.
+ * Handles are passed as void * so that one table type can describe both
+ * the file-backed implementation and the real client library. */
+struct ops
+{
+	/* Label for this backend. */
+	char *name;
+
+	/* List the children of path; *num receives the count. */
+	char **(*dir)(void *h, const char *path, unsigned int *num);
+
+	/* Read the value at path; *len receives its length. */
+	void *(*read)(void *h, const char *path, unsigned int *len);
+
+	/* Write len bytes of data to path under the given create flags. */
+	bool (*write)(void *h, const char *path, const void *data,
+		      unsigned int len, int createflags);
+
+	bool (*mkdir)(void *h, const char *path);
+
+	bool (*rm)(void *h, const char *path);
+
+	/* Fetch the permission list of path; *num receives the count. */
+	struct xs_permissions *(*get_perms)(void *h,
+					    const char *path,
+					    unsigned int *num);
+
+	/* Replace the permission list of path with perms[0..num-1]. */
+	bool (*set_perms)(void *h,
+			  const char *path,
+			  struct xs_permissions *perms,
+			  unsigned int num);
+
+	bool (*transaction_start)(void *h, const char *subtree);
+	bool (*transaction_end)(void *h, bool abort);
+
+	/* Create and destroy a new handle. */
+	void *(*handle)(const char *path);
+	void (*close)(void *);
+};
+
+/* State of one handle of the file-backed implementation. */
+struct file_ops_info
+{
+	/* Directory holding the tree, as given to file_handle(). */
+	const char *base;
+	/* "<base>.transact" while a transaction is open, otherwise NULL. */
+	char *transact_base;
+	/* Subtree of the open transaction, otherwise NULL. */
+	char *transact;
+};
+
+/* Turn the file at dirname into a directory of the same name, keeping the
+ * old contents as "<dirname>/.DATA" and moving "<dirname>.perms", if there
+ * is one, to "<dirname>/.perms". */
+static void convert_to_dir(const char *dirname)
+{
+	char *parked = talloc_asprintf(dirname, "%s.tmp", dirname);
+	char *datafile, *old_perms, *new_perms;
+
+	/* Park the file so the directory can take over its name. */
+	if (rename(dirname, parked) != 0)
+		barf_perror("Failed to rename %s to %s", dirname, parked);
+	if (mkdir(dirname, 0700) != 0)
+		barf_perror("Failed to mkdir %s", dirname);
+
+	/* The old contents live on as the directory's .DATA entry. */
+	datafile = talloc_asprintf(dirname, "%s/.DATA", dirname);
+	if (rename(parked, datafile) != 0)
+		barf_perror("Failed to rename into %s", dirname);
+
+	/* If perms exists, move it in: the result is not checked because
+	 * there may be no such file. */
+	old_perms = talloc_asprintf(dirname, "%s.perms", dirname);
+	new_perms = talloc_asprintf(dirname, "%s/.perms", dirname);
+	rename(old_perms, new_perms);
+}
+
+/* Files can be used as dirs, too.  Convert them when they are.
+ * If the parent component of filename (everything before its last '/')
+ * exists as a regular file, it is turned into a directory. */
+static void maybe_convert_to_directory(const char *filename)
+{
+	struct stat st;
+	const char *slash = strrchr(filename, '/');
+	char *dirname;
+
+	/* No '/' at all: there is no parent component to convert. */
+	if (!slash)
+		return;
+
+	/* "%.*s" takes its precision as an int, but a pointer difference is
+	 * a ptrdiff_t (wider than int on LP64), so convert explicitly. */
+	dirname = talloc_asprintf(filename, "%.*s",
+				  (int)(slash - filename), filename);
+	if (lstat(dirname, &st) == 0 && S_ISREG(st.st_mode))
+		convert_to_dir(dirname);
+}
+
+/* Map a store path to a filesystem name: the path appended to the
+ * transaction copy's root if one is set, else to the base directory. */
+static char *get_name(struct file_ops_info *info, const char *path)
+{
+	const char *root = info->transact_base ? info->transact_base
+					       : info->base;
+
+	return talloc_asprintf(path, "%s%s", root, path);
+}
+
+/* Like get_name(), but first makes sure the parent of the resulting name
+ * is usable as a directory. */
+static char *path_to_name(struct file_ops_info *info, const char *path)
+{
+	char *name = get_name(info, path);
+
+	maybe_convert_to_directory(name);
+	return name;
+}
+
+/* Is child a subnode of parent, or equal?
+ * True when parent is "/", or when child starts with parent and the next
+ * character of child is '/' or the end of the string. */
+static bool is_child(const char *child, const char *parent)
+{
+	unsigned int plen;
+
+	/* / should really be "" for this algorithm to work, but that's a
+	 * usability nightmare. */
+	if (streq(parent, "/"))
+		return true;
+
+	plen = strlen(parent);
+	if (strncmp(child, parent, plen) == 0) {
+		char next = child[plen];
+
+		return next == '/' || next == '\0';
+	}
+	return false;
+}
+
+/* May path be modified?  With a transaction open, only nodes inside its
+ * subtree are writable; anything else fails with errno = EROFS. */
+static bool write_ok(struct file_ops_info *info, const char *path)
+{
+	if (!info->transact || is_child(path, info->transact))
+		return true;
+
+	errno = EROFS;
+	return false;
+}
+
+/* List the children of path.
+ * Directory entries whose name contains a '.' are skipped.
+ * Returns one malloc()ed block holding *num string pointers followed by
+ * the strings themselves (release it with a single free()), or NULL with
+ * errno set if the directory cannot be opened or memory runs out. */
+static char **file_directory(struct file_ops_info *info,
+			     const char *path, unsigned int *num)
+{
+	char **ret;
+	DIR *dir;
+	struct dirent *dirent;
+	char *p, *dirname = path_to_name(info, path);
+	unsigned int i, len = 0;
+	struct stat st;
+
+	/* If it exists, but isn't a directory, we convert it. */
+	if (lstat(dirname, &st) == 0 && !S_ISDIR(st.st_mode))
+		convert_to_dir(dirname);
+
+	*num = 0;
+	dir = opendir(dirname);
+	if (!dir)
+		return NULL;
+
+	/* Once to count them. */
+	while ((dirent = readdir(dir)) != NULL) {
+		if (strchr(dirent->d_name, '.'))
+			continue;
+		len += strlen(dirent->d_name) + 1;
+		(*num)++;
+	}
+	rewinddir(dir);
+
+	/* Now allocate and fill in.  One extra byte keeps the request
+	 * non-zero: malloc(0) is allowed to return NULL, which callers
+	 * would mistake for a failed listing of an empty directory. */
+	ret = malloc(sizeof(char *) * *num + len + 1);
+	if (!ret) {
+		int saved_errno = errno;
+		closedir(dir);
+		errno = saved_errno;
+		return NULL;
+	}
+	p = (char *)&ret[*num];
+	i = 0;
+	while ((dirent = readdir(dir)) != NULL) {
+		if (strchr(dirent->d_name, '.'))
+			continue;
+		ret[i] = p;
+		strcpy(p, dirent->d_name);
+		p += strlen(p) + 1;
+		i++;
+	}
+	closedir(dir);
+
+	return ret;
+}
+
+/* Name of the file holding a node's data: "<filename>/.DATA" when
+ * filename is a directory, otherwise filename itself. */
+static char *filename_to_data(const char *filename)
+{
+	struct stat st;
+	bool is_dir = lstat(filename, &st) == 0 && S_ISDIR(st.st_mode);
+
+	if (!is_dir)
+		return (char *)filename;
+	return talloc_asprintf(filename, "%s/.DATA", filename);
+}
+
+/* Read the data stored at path.
+ * Returns the buffer obtained from grab_file() and stores its size in
+ * *len, or returns NULL with errno set.  A directory that has no .DATA
+ * entry reports EISDIR instead of ENOENT. */
+static void *file_read(struct file_ops_info *info,
+		       const char *path, unsigned int *len)
+{
+	void *ret;
+	char *filename = filename_to_data(path_to_name(info, path));
+	/* Zeroed so that *len is well defined even when grab_file() fails
+	 * without storing a size. */
+	unsigned long size = 0;
+
+	ret = grab_file(filename, &size);
+	/* Directory exists, .DATA doesn't. */
+	if (!ret && errno == ENOENT && strends(filename, ".DATA"))
+		errno = EISDIR;
+	*len = size;
+	return ret;
+}
+
+/* Read the permission list stored alongside a node.
+ * The list lives in "<node>/.perms" when the node is a directory and in
+ * "<node>.perms" otherwise.  A node without a permissions file gets one
+ * default entry for id 0: readable for "/", no access for anything else.
+ * Returns an array of *num entries (callers in this file release it with
+ * free()), or NULL with errno from lstat() if the node does not exist. */
+static struct xs_permissions *file_get_perms(struct file_ops_info *info,
+					     const char *path,
+					     unsigned int *num)
+{
+	void *perms;
+	struct xs_permissions *ret;
+	char *filename = path_to_name(info, path);
+	char *permfile;
+	unsigned long size;
+	struct stat st;
+
+	/* The node itself must exist. */
+	if (lstat(filename, &st) != 0)
+		return NULL;
+
+	if (S_ISDIR(st.st_mode))
+		permfile = talloc_asprintf(path, "%s/.perms", filename);
+	else
+		permfile = talloc_asprintf(path, "%s.perms", filename);
+
+	perms = grab_file(permfile, &size);
+	if (!perms) {
+		/* No permfile: we didn't bother, return defaults.
+		 * Nothing was grabbed, so there is nothing to release (and
+		 * size holds no meaningful value here). */
+		ret = new(struct xs_permissions);
+		ret[0].id = 0;
+		/* Default for root is readable. */
+		if (streq(path, "/"))
+			ret[0].perms = XS_PERM_READ;
+		else
+			ret[0].perms = XS_PERM_NONE;
+		*num = 1;
+		return ret;
+	}
+	*num = count_strings(perms, size);
+
+	ret = new_array(struct xs_permissions, *num);
+	if (!strings_to_perms(ret, *num, perms))
+		barf("Reading permissions from %s", permfile);
+	release_file(perms, size);
+	return ret;
+}
+
+/* Store a node's permission list in its permissions file, replacing any
+ * previous contents.  Each entry is written as the string produced by
+ * perm_to_string(), including its nul terminator.
+ * Returns false with errno set on failure: EINVAL for an empty list, the
+ * error from write_ok() when the node may not be modified, or whatever
+ * lstat(), open() or perm_to_string() reported. */
+static bool file_set_perms(struct file_ops_info *info,
+			   const char *path,
+			   struct xs_permissions *perms,
+			   unsigned int num)
+{
+	unsigned int i;
+	char *filename = path_to_name(info, path);
+	char *permfile;
+	int fd;
+	struct stat st;
+
+	if (num < 1) {
+		errno = EINVAL;
+		return false;
+	}
+
+	if (!write_ok(info, path))
+		return false;
+
+	/* Check non-perm file exists. */
+	if (lstat(filename, &st) != 0)
+		return false;
+
+	/* Directories keep their permissions inside, files next to them. */
+	if (S_ISDIR(st.st_mode)) 
+		permfile = talloc_asprintf(path, "%s/.perms", filename);
+	else
+		permfile = talloc_asprintf(path, "%s.perms", filename);
+
+	fd = open(permfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
+	if (fd < 0)
+		return false;
+
+	for (i = 0; i < num; i++) {
+		char buffer[100];
+
+		if (!perm_to_string(&perms[i], buffer)) {
+			/* close() may change errno: keep the original. */
+			int saved_errno = errno;
+			close(fd);
+			errno = saved_errno;
+			return false;
+		}
+		if (write(fd, buffer, strlen(buffer) + 1)
+		    != (int)strlen(buffer) + 1)
+			barf_perror("Failed to write perm");
+	}
+	close(fd);
+	return true;
+}
+
+/* Write len bytes of data as the value of path, truncating what was there.
+ * createflags may be 0, O_CREAT or O_CREAT|O_EXCL; any other bit fails
+ * with EINVAL.  Returns false with errno set on failure. */
+static bool file_write(struct file_ops_info *info,
+		       const char *path, const void *data,
+		       unsigned int len, int createflags)
+{
+	char *filename = filename_to_data(path_to_name(info, path));
+	int fd;
+
+	/* Kernel isn't strict, but library is. */
+	if (createflags & ~(O_CREAT|O_EXCL)) {
+		errno = EINVAL;
+		return false;
+	}
+
+	if (!write_ok(info, path))
+		return false;
+
+	/* We regard it as existing if dir exists. */
+	if (strends(filename, ".DATA")) {
+		/* The .DATA file itself may be missing, so a plain write
+		 * still has to be allowed to create it. */
+		if (!createflags)
+			createflags = O_CREAT;
+		if (createflags & O_EXCL) {
+			errno = EEXIST;
+			return false;
+		}
+	}
+
+	fd = open(filename, createflags|O_TRUNC|O_WRONLY, 0600);
+	if (fd < 0) {
+		/* FIXME: Another hack. */
+		if (!(createflags & O_CREAT) && errno == EISDIR)
+			errno = EEXIST;
+		return false;
+	}
+
+	if (write(fd, data, len) != (int)len)
+		barf_perror("Bad write to %s", filename);
+
+	close(fd);
+	return true;
+}
+
+/* Create the directory for path.
+ * Returns false with errno set on failure.  The order of the checks is
+ * chosen so that the errno reported matches the daemon's. */
+static bool file_mkdir(struct file_ops_info *info, const char *path)
+{
+	char *dirname = path_to_name(info, path);
+
+	/* Same effective order as daemon, so error returns are right. */
+	if (mkdir(dirname, 0700) != 0) {
+		/* Unless the parent is missing or not a directory, let
+		 * write_ok() replace errno if the path is not writable. */
+		if (errno != ENOENT && errno != ENOTDIR)
+			write_ok(info, path);
+		return false;
+	}
+
+	if (!write_ok(info, path)) {
+		/* Not allowed after all: undo the mkdir, keep the errno. */
+		int saved_errno = errno;
+		rmdir(dirname);
+		errno = saved_errno;
+		return false;
+	}
+	return true;
+}
+
+/* Run cmd through system() and barf unless it exits with status 0. */
+static void do_command(const char *cmd)
+{
+	int status = system(cmd);
+	bool ok = status != -1 && WIFEXITED(status)
+		&& WEXITSTATUS(status) == 0;
+
+	if (!ok)
+		barf_perror("Failed '%s': %i", cmd, status);
+}
+
+/* Remove the node at path together with its children and its permissions
+ * file.  Returns false with errno set: EINVAL for the root of the open
+ * transaction or for "/", the lstat() error if the node does not exist,
+ * or the error from write_ok(). */
+static bool file_rm(struct file_ops_info *info, const char *path)
+{
+	char *filename = path_to_name(info, path);
+	struct stat st;
+
+	/* The subtree a transaction is rooted at cannot itself be removed. */
+	if (info->transact && streq(info->transact, path)) {
+		errno = EINVAL;
+		return false;
+	}
+
+	if (lstat(filename, &st) != 0)
+		return false;
+
+	if (!write_ok(info, path))
+		return false;
+
+	if (streq(path, "/")) {
+		errno = EINVAL;
+		return false;
+	}
+
+	/* The names are pasted into a shell command unquoted.
+	 * NOTE(review): presumably safe because paths come from
+	 * random_path() -- confirm if other callers are added. */
+	do_command(talloc_asprintf(path, "rm -f %s.perms; rm -r %s", 
+				   filename, filename));
+	return true;
+}
+
+/* Open a transaction on subtree by copying the whole base directory to
+ * "<base>.transact"; later operations then work on that copy.
+ * Fails with EBUSY if a transaction is already open, or with the lstat()
+ * error if subtree does not exist. */
+static bool file_transaction_start(struct file_ops_info *info,
+				   const char *subtree)
+{
+	struct stat st;
+	char *copy_cmd;
+	char *target = path_to_name(info, subtree);
+
+	/* Only one transaction at a time. */
+	if (info->transact) {
+		errno = EBUSY;
+		return false;
+	}
+
+	/* The subtree has to exist. */
+	if (lstat(target, &st) != 0)
+		return false;
+
+	copy_cmd = talloc_asprintf(NULL, "cp -r %s %s.transact",
+				   info->base, info->base);
+	do_command(copy_cmd);
+	talloc_free(copy_cmd);
+
+	info->transact = talloc_strdup(NULL, subtree);
+	info->transact_base = talloc_asprintf(NULL, "%s.transact", info->base);
+	return true;
+}
+
+/* Close the open transaction.  On abort the working copy is deleted; on
+ * commit it replaces the base directory.
+ * Fails with ENOENT if no transaction is open. */
+static bool file_transaction_end(struct file_ops_info *info, bool abort)
+{
+	char *cmd;
+
+	if (!info->transact) {
+		errno = ENOENT;
+		return false;
+	}
+
+	if (abort) {
+		/* Discard: throw the working copy away. */
+		cmd = talloc_asprintf(NULL, "rm -r %s", info->transact_base);
+		do_command(cmd);
+	} else {
+		/* Commit: the working copy takes the place of the base. */
+		cmd = talloc_asprintf(NULL, "rm -rf %s", info->base);
+		do_command(cmd);
+		talloc_free(cmd);
+
+		cmd = talloc_asprintf(NULL, "mv %s %s",
+				      info->transact_base, info->base);
+		do_command(cmd);
+	}
+
+	/* Either way the handle is back outside any transaction. */
+	talloc_free(cmd);
+	talloc_free(info->transact);
+	talloc_free(info->transact_base);
+	info->transact = NULL;
+	info->transact_base = NULL;
+	return true;
+}
+
+/* Create a handle for the tree rooted at dir, with no transaction open.
+ * dir is stored by pointer, not copied. */
+static struct file_ops_info *file_handle(const char *dir)
+{
+	struct file_ops_info *fresh = talloc(NULL, struct file_ops_info);
+
+	fresh->transact = NULL;
+	fresh->transact_base = NULL;
+	fresh->base = dir;
+	return fresh;
+}
+
+/* Destroy a handle created by file_handle(). */
+static void file_close(struct file_ops_info *handle)
+{
+	talloc_free(handle);
+}
+
+/* Open a connection to the daemon; dir is ignored and exists only so the
+ * signature lines up with file_handle().  Barfs if the connection fails. */
+static struct xs_handle *xs_handle(const char *dir __attribute__((unused)))
+{
+	struct xs_handle *conn = xs_daemon_open();
+
+	if (conn == NULL)
+		barf_perror("Connecting to xs daemon");
+	return conn;
+}
+
+/* Close a connection opened by xs_handle(). */
+static void xs_close(struct xs_handle *handle)
+{
+	xs_daemon_close(handle);
+}
+
+/* Operation table for the file-backed implementation.  The (void *) casts
+ * paper over the handle type: these functions take a
+ * struct file_ops_info *, the table declares void *. */
+struct ops file_ops = {
+	.name = "FILE",
+	.dir = (void *)file_directory,
+	.read = (void *)file_read,
+	.write = (void *)file_write,
+	.mkdir = (void *)file_mkdir,
+	.rm = (void *)file_rm,
+	.get_perms = (void *)file_get_perms,
+	.set_perms = (void *)file_set_perms,
+	.transaction_start = (void *)file_transaction_start,
+	.transaction_end = (void *)file_transaction_end,
+	.handle = (void *)file_handle,
+	.close = (void *)file_close,
+};
+
+/* Operation table for the client library.  The (void *) casts paper over
+ * the handle type: these functions take a struct xs_handle *, the table
+ * declares void *. */
+struct ops xs_ops = {
+	.name = "XS",
+	.dir = (void *)xs_directory,
+	.read = (void *)xs_read,
+	.write = (void *)xs_write,
+	.mkdir = (void *)xs_mkdir,
+	.rm = (void *)xs_rm,
+	.get_perms = (void *)xs_get_permissions,
+	.set_perms = (void *)xs_set_permissions,
+	.transaction_start = (void *)xs_transaction_start,
+	.transaction_end = (void *)xs_transaction_end,
+	.handle = (void *)xs_handle,
+	.close = (void *)xs_close,
+};
+
+/* qsort() comparator for an array of char *: orders by strcmp(). */
+static int strptrcmp(const void *a, const void *b)
+{
+	const char *lhs = *(char **)a;
+	const char *rhs = *(char **)b;
+
+	return strcmp(lhs, rhs);
+}
+
+/* Sort the num names in dir in place, in strcmp() order. */
+static void sort_dir(char **dir, unsigned int num)
+{
+	qsort(dir, num, sizeof(char *), strptrcmp);
+}
+
+/* Recursively render the children of node as text.
+ * For each name in dir (sorted first) this emits a line with its
+ * permissions, a line with its contents if it has any, and then the dump
+ * of its own children with one more space of indentation.
+ * Returns a talloc string hanging off node, or NULL as soon as one of the
+ * backend calls fails. */
+static char *dump_dir(struct ops *ops,
+		      void *h,
+		      const char *node,
+		      char **dir,
+		      unsigned int numdirs,
+		      unsigned int depth)
+{
+	char *ret = talloc_strdup(node, "");
+	unsigned int i;
+	/* One space of indentation per level of depth. */
+	char spacing[depth+1];
+
+	memset(spacing, ' ', depth);
+	spacing[depth] = '\0';
+
+	sort_dir(dir, numdirs);
+
+	for (i = 0; i < numdirs; i++) {
+		struct xs_permissions *perms;
+		unsigned int j, numperms;
+		unsigned int len;
+		char *contents;
+		unsigned int subnum;
+		char **subdirs;
+		char *subret;
+		char *subnode = talloc_asprintf(node, "%s/%s", node, dir[i]);
+
+		perms = ops->get_perms(h, subnode, &numperms);
+		if (!perms)
+			return NULL;
+		ret = talloc_asprintf_append(ret, "%s%s: ", spacing, dir[i]);
+		for (j = 0; j < numperms; j++) {
+			char buffer[100];
+			if (!perm_to_string(&perms[j], buffer))
+				barf("perm to string");
+			ret = talloc_asprintf_append(ret, "%s ", buffer);
+		}
+		free(perms);
+		ret = talloc_asprintf_append(ret, "\n");
+
+		/* Even directories can have contents. */
+		contents = ops->read(h, subnode, &len);
+		if (!contents) {
+			/* EISDIR only means "no contents here": carry on. */
+			if (errno != EISDIR)
+				return NULL;
+		} else {
+			ret = talloc_asprintf_append(ret, " %s(%.*s)\n",
+						     spacing, len, contents);
+			free(contents);
+		}			
+
+		/* Every node is a directory. */
+		subdirs = ops->dir(h, subnode, &subnum);
+		if (!subdirs)
+			return NULL;
+		subret = dump_dir(ops, h, subnode, subdirs, subnum, depth+1);
+		if (!subret)
+			return NULL;
+		ret = talloc_asprintf_append(ret, "%s", subret);
+		free(subdirs);
+	}
+	return ret;
+}
+
+/* Render the whole tree reachable through h as text.
+ * Returns a talloc string with a NULL parent, or NULL if listing "/" or
+ * any step of the recursive dump fails. */
+static char *dump(struct ops *ops, void *h)
+{
+	char **subdirs;
+	unsigned int subnum;
+	char *ret = NULL, *root = talloc_strdup(NULL, "/");
+
+	subdirs = ops->dir(h, root, &subnum);
+	if (subdirs) {
+		/* The node name passed down is "" so that dump_dir()'s
+		 * "%s/%s" yields "/child" for top-level children. */
+		ret = dump_dir(ops, h, talloc_strdup(root, ""), subdirs,
+			       subnum, 0);
+		free(subdirs);
+		/* Reparent the result before root is freed below. */
+		if (ret)
+			talloc_steal(NULL, ret);
+	}
+	talloc_free(root);
+	return ret;
+}
+
+/* jhash.h: Jenkins hash support.
+ *
+ * Copyright (C) 1996 Bob Jenkins (bob_jenkins@burtleburtle.net)
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
+ * hash(), hash2(), hash3, and mix() are externally useful functions.
+ * Routines to test the hash are included if SELF_TEST is defined.
+ * You can use this free for any purpose.  It has no warranty.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are surely my fault.  -DaveM
+ */
+
+/* Mix three words through nine subtract/xor-shift steps.
+ * NOTE: Arguments are modified, and each is expanded many times, so only
+ * plain lvalues should be passed. */
+#define __jhash_mix(a, b, c) \
+{ \
+  a -= b; a -= c; a ^= (c>>13); \
+  b -= c; b -= a; b ^= (a<<8); \
+  c -= a; c -= b; c ^= (b>>13); \
+  a -= b; a -= c; a ^= (c>>12);  \
+  b -= c; b -= a; b ^= (a<<16); \
+  c -= a; c -= b; c ^= (b>>5); \
+  a -= b; a -= c; a ^= (c>>3);  \
+  b -= c; b -= a; b ^= (a<<10); \
+  c -= a; c -= b; c ^= (b>>15); \
+}
+
+/* The golden ratio: an arbitrary value */
+#define JHASH_GOLDEN_RATIO	0x9e3779b9
+
+/* The most generic version, hashes an arbitrary sequence
+ * of bytes.  No alignment or length assumptions are made about
+ * the input key.
+ */
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+	u32 a, b, c, len;
+	const u8 *k = key;
+
+	len = length;
+	a = b = JHASH_GOLDEN_RATIO;
+	c = initval;
+
+	/* Consume the key 12 bytes at a time; each group of four bytes is
+	 * assembled least-significant byte first. */
+	while (len >= 12) {
+		a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24));
+		b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24));
+		c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24));
+
+		__jhash_mix(a,b,c);
+
+		k += 12;
+		len -= 12;
+	}
+
+	/* Fold in the total length, then the 0..11 leftover bytes.  Every
+	 * case falls through on purpose; c's lowest byte is left to the
+	 * length added above. */
+	c += length;
+	switch (len) {
+	case 11: c += ((u32)k[10]<<24);
+	case 10: c += ((u32)k[9]<<16);
+	case 9 : c += ((u32)k[8]<<8);
+	case 8 : b += ((u32)k[7]<<24);
+	case 7 : b += ((u32)k[6]<<16);
+	case 6 : b += ((u32)k[5]<<8);
+	case 5 : b += k[4];
+	case 4 : a += ((u32)k[3]<<24);
+	case 3 : a += ((u32)k[2]<<16);
+	case 2 : a += ((u32)k[1]<<8);
+	case 1 : a += k[0];
+	};
+
+	__jhash_mix(a,b,c);
+
+	return c;
+}
+
+/* A special optimized version that handles 1 or more of u32s.
+ * The length parameter here is the number of u32s in the key.
+ */
+static inline u32 jhash2(u32 *k, u32 length, u32 initval)
+{
+	u32 a, b, c, len;
+
+	a = b = JHASH_GOLDEN_RATIO;
+	c = initval;
+	len = length;
+
+	/* Consume the key three words at a time. */
+	while (len >= 3) {
+		a += k[0];
+		b += k[1];
+		c += k[2];
+		__jhash_mix(a, b, c);
+		k += 3; len -= 3;
+	}
+
+	/* Fold in the key length in bytes (4 per word). */
+	c += length * 4;
+
+	/* Leftover words; case 2 falls through to case 1 on purpose. */
+	switch (len) {
+	case 2 : b += k[1];
+	case 1 : a += k[0];
+	};
+
+	__jhash_mix(a,b,c);
+
+	return c;
+}
+
+
+/* Special ultra-optimized versions that know they are hashing exactly
+ * 3, 2 or 1 word(s).
+ *
+ * NOTE: In particular the "c += length; __jhash_mix(a,b,c);" normally
+ *       done at the end is not done here.
+ */
+/* Hash exactly three words with a single mix round. */
+static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
+{
+	a += JHASH_GOLDEN_RATIO;
+	b += JHASH_GOLDEN_RATIO;
+	c += initval;
+
+	__jhash_mix(a, b, c);
+
+	return c;
+}
+
+/* Hash two words: jhash_3words() with the third word set to 0. */
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+	return jhash_3words(a, b, 0, initval);
+}
+
+/* Hash one word: jhash_3words() with the other two words set to 0. */
+static inline u32 jhash_1word(u32 a, u32 initval)
+{
+	return jhash_3words(a, 0, 0, initval);
+}
+
+/* Return the next pseudo-random value and advance *state by one.
+ * The value is jhash_1word() of the pre-increment state, with that same
+ * state times 1103515243 as the initial value. */
+static unsigned int get_randomness(int *state)
+{
+	/* Read the state exactly once, before advancing it.  Incrementing
+	 * and reading *state in two arguments of one call would leave them
+	 * unsequenced, which is undefined behaviour. */
+	u32 seed = (u32)*state;
+
+	*state = (int)(seed + 1);
+
+	/* Unsigned arithmetic, so the multiplication wraps instead of
+	 * overflowing a signed int. */
+	return jhash_1word(seed, seed * 1103515243u);
+}
+
+/* Build a pseudo-random path from *state.
+ * One time in 20 the result is "/"; otherwise it is one or more "/<n>"
+ * components with n in 0..14, with an even chance of adding another
+ * component after each one.  The number and order of get_randomness()
+ * calls determines the sequence, so it must not be rearranged. */
+static char *random_path(int *state)
+{
+	unsigned int i;
+	char *ret = NULL;
+
+	if (get_randomness(state) % 20 == 0)
+		return talloc_strdup(NULL, "/");
+
+	/* "i < 1 ||" forces the first component without drawing a number. */
+	for (i = 0; i < 1 || (get_randomness(state) % 2); i++) {
+		ret = talloc_asprintf_append(ret, "/%i", 
+					     get_randomness(state) % 15);
+	}
+	return ret;
+}
+
+/* Turn an operation's result into text: "OK" on success, otherwise
+ * "FAILED:" followed by the message for the current errno. */
+static char *bool_to_errstring(bool result)
+{
+	if (!result) {
+		/* Real daemon can never return this. */
+		if (errno == ENOTDIR)
+			errno = ENOENT;
+		return talloc_asprintf(NULL, "FAILED:%s", strerror(errno));
+	}
+
+	return talloc_strdup(NULL, "OK");
+}
+
+/* Turn a directory listing into text: the names sorted, one per line, or
+ * the failure string when dir is NULL.  Frees dir. */
+static char *linearize_dir(char **dir, unsigned int *num)
+{
+	char *text;
+	unsigned int n;
+
+	if (!dir)
+		return bool_to_errstring(false);
+
+	if (*num == 0) {
+		/* An empty listing is an empty string, not NULL. */
+		text = talloc_strdup(NULL, "");
+	} else {
+		text = NULL;
+		sort_dir(dir, *num);
+		for (n = 0; n < *num; n++)
+			text = talloc_asprintf_append(text, "%s\n", dir[n]);
+	}
+
+	free(dir);
+	return text;
+}
+
+/* Turn a read result into text of the form "<size>:<data>", or the
+ * failure string when read is NULL.  Frees read. */
+static char *linearize_read(char *read, unsigned int *size)
+{
+	char *text;
+
+	if (read == NULL)
+		return bool_to_errstring(false);
+
+	text = talloc_asprintf(NULL, "%i:%.*s", *size, *size, read);
+	free(read);
+	return text;
+}
+
+/* Turn a permission list into text, one "(id perms)" group per entry, or
+ * the failure string when perms is NULL.  Frees perms. */
+static char *linearize_perms(struct xs_permissions *perms, unsigned int *size)
+{
+	char *text = NULL;
+	unsigned int n = 0;
+
+	if (perms == NULL)
+		return bool_to_errstring(false);
+
+	while (n < *size) {
+		text = talloc_asprintf_append(text, "(%u %u)",
+					      perms[n].id, perms[n].perms);
+		n++;
+	}
+
+	free(perms);
+	return text;
+}
+
+/* Pick create flags for a write: each of 0, O_CREAT and O_CREAT|O_EXCL
+ * one time in four, and otherwise an arbitrary bit pattern. */
+static int random_flags(int *state)
+{
+	unsigned int pick = get_randomness(state) % 4;
+
+	if (pick == 0)
+		return 0;
+	if (pick == 1)
+		return O_CREAT;
+	if (pick == 2)
+		return O_CREAT|O_EXCL;
+
+	/* Anything at all, so invalid flags get exercised too. */
+	return get_randomness(state);
+}
+
+/* Do the next operation, return the results.
+ * The operation, its path and its arguments are all derived from state,
+ * so the same state produces the same operation for every backend.
+ * state is passed by value: the caller's counter is not advanced.
+ * Returns a talloc string describing the outcome; the path (and, for
+ * writes, the data) are reparented onto that string.
+ * With verbose set, the operation is printed before it runs. */
+static char *do_next_op(struct ops *ops, void *h, int state, bool verbose)
+{
+	char *name;
+	unsigned int num;
+	char *ret;
+
+	if (verbose)
+		printf("State %i: ", state);
+
+	name = random_path(&state);
+	switch (get_randomness(&state) % 9) {
+	case 0:
+		if (verbose)
+			printf("DIR %s\n", name);
+		ret = linearize_dir(ops->dir(h, name, &num), &num);
+		break;
+	case 1:
+		if (verbose)
+			printf("READ %s\n", name);
+		ret = linearize_read(ops->read(h, name, &num), &num);
+		break;
+	case 2: {
+		int flags = random_flags(&state);
+		/* Data is a random number in decimal, cut to a random
+		 * length (possibly zero). */
+		char *contents = talloc_asprintf(NULL, "%i",
+						 get_randomness(&state));
+		unsigned int len = get_randomness(&state)%(strlen(contents)+1);
+		if (verbose)
+			printf("WRITE %s %s %.*s\n", name,
+			       flags == O_CREAT ? "O_CREAT" 
+			       : flags == (O_CREAT|O_EXCL) ? "O_CREAT|O_EXCL"
+			       : flags == 0 ? "0" : "CRAPFLAGS",
+			       len, contents);
+		ret = bool_to_errstring(ops->write(h, name, contents, len,
+						   flags));
+		talloc_steal(ret, contents);
+		break;
+	}
+	case 3:
+		if (verbose)
+			printf("MKDIR %s\n", name);
+		ret = bool_to_errstring(ops->mkdir(h, name));
+		break;
+	case 4:
+		if (verbose)
+			printf("RM %s\n", name);
+		ret = bool_to_errstring(ops->rm(h, name));
+		break;
+	case 5:
+		if (verbose)
+			printf("GETPERMS %s\n", name);
+		ret = linearize_perms(ops->get_perms(h, name, &num),
+				      &num);
+		break;
+	case 6: {
+		/* This num shadows the outer one: 0..7 entries, so the
+		 * empty list is exercised as well. */
+		unsigned int i, num = get_randomness(&state)%8;
+		struct xs_permissions perms[num];
+
+		if (verbose)
+			printf("SETPERMS %s: ", name);
+		for (i = 0; i < num; i++) {
+			perms[i].id = get_randomness(&state)%8;
+			perms[i].perms = get_randomness(&state)%4;
+			if (verbose)
+				printf("%i%c ", perms[i].id,
+				       perms[i].perms == XS_PERM_WRITE ? 'W'
+				       : perms[i].perms == XS_PERM_READ ? 'R'
+				       : perms[i].perms == 
+				       (XS_PERM_READ|XS_PERM_WRITE) ? 'B'
+				       : 'N');
+		}
+		if (verbose)
+			printf("\n");
+		ret = bool_to_errstring(ops->set_perms(h, name, perms,
+						       num));
+		break;
+	}
+	case 7: {
+		if (verbose)
+			printf("START %s\n", name);
+		ret = bool_to_errstring(ops->transaction_start(h, name));
+		/* Success is tagged with the subtree. */
+		if (streq(ret, "OK")) {
+			talloc_free(ret);
+			ret = talloc_asprintf(NULL, "OK:START-TRANSACT:%s",
+					      name);
+		}
+
+		break;
+	}
+	case 8: {
+		bool abort = (get_randomness(&state) % 2);
+
+		if (verbose)
+			printf("STOP %s\n", abort ? "ABORT" : "COMMIT");
+		ret = bool_to_errstring(ops->transaction_end(h, abort));
+		/* Success is tagged as a transaction end. */
+		if (streq(ret, "OK")) {
+			talloc_free(ret);
+			ret = talloc_strdup(NULL, "OK:STOP-TRANSACT");
+		}
+		break;
+	}
+	default:
+		barf("Impossible randomness");
+	}
+
+	/* Tie the path's lifetime to the result string. */
+	talloc_steal(ret, name);
+	return ret;
+}
+
+/* PID of the daemon forked by setup_xs_ops().  cleanup_xs_ops() treats 0
+ * as "nothing to stop". */
+static int daemon_pid;
+
+/* Stop the test daemon (if we started one) and wipe its scratch state.
+ *
+ * A clean shutdown is requested over a fresh connection first; if that
+ * does not work the daemon is sent SIGTERM instead.  Either way the child
+ * is reaped and daemon_pid is reset to 0, so a second call cannot signal
+ * a stale PID.  Finally everything under testsuite/tmp/ is removed. */
+static void cleanup_xs_ops(void)
+{
+	char *cmd;
+
+	/* Only ever signal a real child: kill() with a pid of -1 (what a
+	 * failed fork() leaves behind) would hit every process we are
+	 * allowed to signal. */
+	if (daemon_pid > 0) {
+		struct xs_handle *h;
+
+		h = xs_daemon_open();
+		if (h) {
+			if (xs_shutdown(h)) {
+				waitpid(daemon_pid, NULL, 0);
+				daemon_pid = 0;
+			}
+			xs_daemon_close(h);
+		}
+		/* Polite shutdown failed: insist. */
+		if (daemon_pid > 0) {
+			kill(daemon_pid, SIGTERM);
+			waitpid(daemon_pid, NULL, 0);
+		}
+	}
+	/* Whichever path we took, no daemon of ours is left. */
+	daemon_pid = 0;
+
+	cmd = talloc_asprintf(NULL, "rm -rf testsuite/tmp/*");
+	do_command(cmd);
+	talloc_free(cmd);
+}
+
+/* Remove the file backend's directory "dir" together with its
+ * "dir.transact" sibling, by running rm -rf through do_command(). */
+static void cleanup_file_ops(const char *dir)
+{
+	char *rm_cmd = talloc_asprintf(NULL, "rm -rf %s %s.transact", dir, dir);
+
+	do_command(rm_cmd);
+	talloc_free(rm_cmd);
+}
+
+/* Tear down both backends: the daemon first, then the file store rooted
+ * at "dir". */
+static void cleanup(const char *dir)
+{
+	cleanup_xs_ops();
+	cleanup_file_ops(dir);
+}
+
+/* Create the directory used by the file backend (mode 0700).  Any mkdir()
+ * failure, including the directory already existing, is fatal. */
+static void setup_file_ops(const char *dir)
+{
+	int rc = mkdir(dir, 0700);
+
+	if (rc)
+		barf_perror("Creating directory %s", dir);
+}
+
+/* Fork and exec ./xenstored_test, then wait until it says it is ready.
+ *
+ * The child's stdout is redirected into a pipe and the daemon is run with
+ * --output-pid --no-fork; the parent blocks until something arrives on
+ * that pipe.  On return daemon_pid holds the child's PID.  Failing to
+ * create the pipe, to fork, or to hear anything from the child is fatal. */
+static void setup_xs_ops(void)
+{
+	int fds[2];
+
+	/* Start daemon. */
+	if (pipe(fds) != 0)
+		barf_perror("Creating pipe for daemon");
+
+	daemon_pid = fork();
+	if (daemon_pid < 0) {
+		/* Never leave -1 behind: daemon_pid gets handed to kill(),
+		 * where -1 means "everybody". */
+		daemon_pid = 0;
+		barf_perror("Forking daemon");
+	}
+
+	if (daemon_pid) {
+		/* Child writes PID when it's ready: we wait for that. */
+		char buffer[20];
+		ssize_t got;
+
+		close(fds[1]);
+		got = read(fds[0], buffer, sizeof(buffer));
+		close(fds[0]);
+		/* 0 is EOF: the child went away (e.g. exec failed) without
+		 * ever reporting in. */
+		if (got <= 0)
+			barf("Failed to summon daemon");
+	} else {
+		dup2(fds[1], STDOUT_FILENO);
+		close(fds[0]);
+#if 0
+		execlp("valgrind", "valgrind", "xenstored_test", "--output-pid",
+		       "--no-fork", NULL);
+#else
+		execlp("./xenstored_test", "xenstored_test", "--output-pid",
+		       "--no-fork", NULL);
+#endif
+		/* exec failed.  Use _exit(): exit() would flush any stdio
+		 * output inherited from the parent into the pipe, which the
+		 * parent would mistake for the daemon's "ready" message. */
+		_exit(1);
+	}
+}
+
+/* Bring up both backends: the file store directory "dir" first, then the
+ * daemon. */
+static void setup(const char *dir)
+{
+	setup_file_ops(dir);
+	setup_xs_ops();
+}
+
+/* Parameters handed to try_simple() through its void *data argument. */
+struct simple_data
+{
+	/* Added to the iteration number to seed each random operation. */
+	unsigned int seed;
+	/* Print a '.' every so often while running. */
+	bool print_progress;
+	/* Skip the (slow) snapshot comparison while inside a transaction. */
+	bool fast;
+	/* Backend under test. */
+	struct ops *ops;
+	/* Passed to ops->handle(); also the file backend's directory. */
+	const char *dir;
+};
+
+/* Just a random test.  Don't care about results, just that it doesn't
+ * go boom.
+ *
+ * The backend in data->ops is restarted, then "number" random operations
+ * are run against it.
+ *
+ * trymap:  if non-NULL, operation i is skipped when trymap[i] is false.
+ * number:  how many operations to attempt.
+ * verbose: print the first line of every result.
+ * _data:   a struct simple_data.
+ *
+ * Unless data->fast is set, a dump of the store is taken through a second
+ * handle when a transaction starts; after every following operation, until
+ * the transaction stops, a fresh dump must be identical to it.
+ *
+ * Returns "number" if every operation ran, otherwise the index of the
+ * operation at which the run was abandoned. */
+static unsigned int try_simple(const bool *trymap,
+			       unsigned int number,
+			       bool verbose,
+			       void *_data)
+{
+	unsigned int i, print;
+	void *h;
+	char *snapshot = NULL;
+	struct simple_data *data = _data;
+
+	if (data->ops == &xs_ops) {
+		cleanup_xs_ops();
+		setup_xs_ops();
+	} else {
+		cleanup_file_ops(data->dir);
+		setup_file_ops(data->dir);
+	}
+	h = data->ops->handle(data->dir);
+
+	/* Aim for roughly one line of progress dots. */
+	print = number / 76;
+	if (!print)
+		print = 1;
+
+	for (i = 0; i < number; i++) {
+		char *ret;
+
+		if (data->print_progress) {
+			if (i % print == 0) {
+				printf(".");
+				fflush(stdout);
+			}
+		}
+
+		if (trymap && !trymap[i])
+			continue;
+
+		ret = do_next_op(data->ops, h, i + data->seed, verbose);
+		if (verbose) {
+			/* First line only.  The result need not contain a
+			 * newline at all, and "%.*s" wants an int. */
+			const char *eol = strchr(ret, '\n');
+			int len = eol ? (int)(eol - ret) : (int)strlen(ret);
+
+			printf("-> %.*s\n", len, ret);
+		}
+		if (streq(ret, "FAILED:Bad file descriptor")
+		    || kill(daemon_pid, 0) != 0) {
+			talloc_free(ret);
+			goto out;
+		}
+
+		if (!data->fast) {
+			if (strstarts(ret, "OK:START-TRANSACT:")) {
+				void *pre = data->ops->handle(data->dir);
+
+				snapshot = dump(data->ops, pre);
+				data->ops->close(pre);
+				if (!snapshot) {
+					talloc_free(ret);
+					goto out;
+				}
+			} else if (streq(ret, "OK:STOP-TRANSACT")) {
+				talloc_free(snapshot);
+				snapshot = NULL;
+			}
+		}
+
+		talloc_free(ret);
+
+		if (snapshot) {
+			/* Seen from outside, an open transaction must not
+			 * have changed anything. */
+			void *pre = data->ops->handle(data->dir);
+			char *contents;
+			bool same;
+
+			contents = dump(data->ops, pre);
+			same = contents && streq(contents, snapshot);
+			if (contents)
+				talloc_free(contents);
+			data->ops->close(pre);
+			if (!same)
+				goto out;
+		}
+	}
+	if (data->print_progress)
+		printf("\n");
+
+out:
+	if (snapshot)
+		talloc_free(snapshot);
+	data->ops->close(h);	
+	return i;
+}
+
+/* Binary elimination over map[try_start .. try_start + try_num).
+ *
+ * First see whether the whole range can be switched off while the run
+ * still fails at the very same place (index number-1).  If so, clear the
+ * range in "map" and stop; otherwise split it in two and recurse on each
+ * part.  A single entry that cannot be dropped is left alone. */
+static void reduce(bool *map,
+		   unsigned int number,
+		   unsigned int try_start, unsigned int try_num,
+		   unsigned int (*try)(const bool *map,
+				       unsigned int number,
+				       bool verbose,
+				       void *),
+		   void *data)
+{
+	bool trial[number];
+	unsigned int first_half = try_num / 2;
+
+	if (!try_num)
+		return;
+
+	/* Candidate map: same as the current one minus the whole range. */
+	memcpy(trial, map, sizeof(trial));
+	memset(&trial[try_start], 0, try_num * sizeof(bool));
+
+	if (try(trial, number, false, data) != number - 1) {
+		/* Different outcome: something in here matters. */
+		if (try_num > 1) {
+			reduce(map, number, try_start, first_half, try, data);
+			reduce(map, number, try_start + first_half,
+			       try_num - first_half, try, data);
+		}
+		return;
+	}
+
+	/* Same failure without them: drop the lot. */
+	memset(&map[try_start], 0, try_num * sizeof(bool));
+}
+
+/* Shrink a failing run of "failed" operations to a smaller one that still
+ * fails at the last operation, replay that verbosely, and exit: status 1
+ * if the cut-down run failed as expected, 2 if it did not.  Never
+ * returns. */
+static void reduce_problem(unsigned int failed,
+			   unsigned int (*try)(const bool *map,
+					       unsigned int number,
+					       bool verbose,
+					       void *data),
+			   void *data)
+{
+	bool map[failed];
+	unsigned int stopped_at;
+
+	/* Start with everything enabled; the final (failing) operation is
+	 * never a candidate for removal. */
+	memset(map, 1, sizeof(map));
+	reduce(map, failed, 0, failed-1, try, data);
+
+	printf("Cut down:\n");
+	stopped_at = try(map, failed, true, data);
+	if (stopped_at == failed - 1)
+		exit(1);
+
+	printf("Except, that didn't actually fail.  Bugger!");
+	exit(2);
+}
+
+/* Entry point for --simple: run "iters" random operations against the xs
+ * backend only.  Exits 0 on success; on failure reports the iteration and
+ * hands over to reduce_problem(), which exits too. */
+static void simple_test(const char *dir,
+			unsigned int iters, unsigned int seed,
+			bool fast, bool verbose)
+{
+	struct simple_data data = {
+		.seed = seed,
+		.print_progress = !verbose,
+		.fast = fast,
+		.ops = &xs_ops,
+		.dir = dir,
+	};
+	unsigned int try = try_simple(NULL, iters, verbose, &data);
+
+	if (try != iters) {
+		printf("Failed on iteration %u\n", try + 1);
+		data.print_progress = false;
+		reduce_problem(try + 1, try_simple, &data);
+		return;
+	}
+
+	cleanup_xs_ops();
+	printf("Succeeded\n");
+	exit(0);
+}
+
+/* Recursively compare "node" as seen through backend a/handle ah with the
+ * same node seen through backend b/handle bh.
+ *
+ * Permissions, contents and the sorted list of children must all match,
+ * and every child must compare equal in turn.
+ *
+ * Returns true if the two subtrees are identical.  On false, *fail is set
+ * to the backend whose operation failed outright; it is left untouched
+ * when both backends answered but the answers differ. */
+static bool ops_equal(struct ops *a, void *ah,
+		      struct ops *b, void *bh,
+		      const char *node,
+		      struct ops **fail)
+{
+	char **dira = NULL, **dirb = NULL;
+	char *dataa = NULL, *datab = NULL;
+	unsigned int i, numa, numb, lena, lenb;
+	struct xs_permissions *permsa = NULL, *permsb = NULL;
+	unsigned int numpermsa, numpermsb;
+	char *nodename;
+	bool ret = false;
+
+	/* FILE backend expects talloc'ed pointer. */
+	nodename = talloc_strdup(NULL, node);
+	permsa = a->get_perms(ah, nodename, &numpermsa);
+	if (!permsa) {
+		*fail = a;
+		goto out;
+	}
+	permsb = b->get_perms(bh, nodename, &numpermsb);
+	if (!permsb) {
+		*fail = b;
+		goto out;
+	}
+	if (numpermsa != numpermsb)
+		goto out;
+	for (i = 0; i < numpermsa; i++) {
+		if (permsa[i].perms != permsb[i].perms)
+			goto out;
+		if (permsa[i].id != permsb[i].id)
+			goto out;
+	}
+
+	/* Non-pure-directory nodes contain data. */
+	/* A NULL read with errno EISDIR is not an error: the node simply
+	 * has no contents. */
+	dataa = a->read(ah, nodename, &lena);
+	if (!dataa && errno != EISDIR) {
+		*fail = a;
+		goto out;
+	}
+	datab = b->read(bh, nodename, &lenb);
+	if (!datab && errno != EISDIR) {
+		*fail = b;
+		goto out;
+	}
+
+	/* Either both have contents (which must match byte for byte) or
+	 * neither does. */
+	if (dataa) {
+		if (!datab)
+			goto out;
+		if (lena != lenb)
+			goto out;
+
+		if (memcmp(dataa, datab, lena) != 0)
+			goto out;
+	} else
+		if (datab)
+			goto out;
+
+	/* Everything is a directory. */
+	dira = a->dir(ah, nodename, &numa);
+	if (!dira) {
+		*fail = a;
+		goto out;
+	}
+	dirb = b->dir(bh, nodename, &numb);
+	if (!dirb) {
+		*fail = b;
+		goto out;
+	}
+	if (numa != numb)
+		goto out;
+	/* Sort both listings so they can be compared entry by entry. */
+	sort_dir(dira, numa);
+	sort_dir(dirb, numb);
+	for (i = 0; i < numa; i++) {
+		/* Room for node + '/' + child name + NUL. */
+		char subnode[strlen(node) + 1 + strlen(dira[i]) + 1];
+
+		if (!streq(dira[i], dirb[i]))
+			goto out;
+
+		/* The root is already "/": don't double the separator. */
+		strcpy(subnode, node);
+		if (!streq(node, "/"))
+			strcat(subnode, "/");
+		strcat(subnode, dira[i]);
+		if (!ops_equal(a, ah, b, bh, subnode, fail))
+			goto out;
+	}
+
+	ret = true;
+out:
+	/* Anything not fetched yet is still NULL at this point. */
+	free(permsa);
+	free(permsb);
+	free(dataa);
+	free(datab);
+	free(dira);
+	free(dirb);
+	talloc_free(nodename);
+	return ret;
+}
+
+/* Parameters handed to try_diff() through its void *data argument. */
+struct diff_data
+{
+	/* Added to the iteration number to seed each random operation. */
+	unsigned int seed;
+	/* Print a '.' every so often while running. */
+	bool print_progress;
+	/* Compare the two trees only once at the end, not after each op. */
+	bool fast;
+	/* Directory for the file backend; also passed to xs_handle(). */
+	const char *dir;
+};
+
+/* Differential: try both file and xs backend, watch for differences.
+ *
+ * Both backends are restarted, then the same "number" random operations
+ * are applied to each.  The textual results must be identical.  Unless
+ * data->fast is set, the complete trees are compared after every
+ * operation, and while a transaction is open the transaction's node is
+ * also compared through fresh handles.
+ *
+ * trymap:  if non-NULL, operation i is skipped when trymap[i] is false.
+ * verbose: print each operation's result (up to its first '/', if any).
+ * _data:   a struct diff_data.
+ *
+ * Returns "number" on success, otherwise the index of the operation at
+ * which the backends diverged. */
+static unsigned int try_diff(const bool *trymap,
+			     unsigned int number,
+			     bool verbose,
+			     void *_data)
+{
+	void *fileh, *xsh;
+	char *transact = NULL;
+	struct ops *fail;
+	struct diff_data *data = _data;
+	unsigned int i, print;
+
+	cleanup(data->dir);
+	setup(data->dir);
+
+	fileh = file_handle(data->dir);
+	xsh = xs_handle(data->dir);
+
+	/* Aim for roughly one line of progress dots. */
+	print = number / 76;
+	if (!print)
+		print = 1;
+
+	for (i = 0; i < number; i++) {
+		char *file, *xs;
+
+		if (data->print_progress) {
+			if (i % print == 0) {
+				printf(".");
+				fflush(stdout);
+			}
+		}
+		if (trymap && !trymap[i])
+			continue;
+
+		if (verbose)
+			printf("FILE: ");
+
+		file = do_next_op(&file_ops, fileh, i+data->seed, verbose);
+		if (verbose) {
+			/* Many results contain no '/' at all, and "%.*s"
+			 * wants an int, not a pointer difference. */
+			const char *cut = strchr(file, '/');
+			int len = cut ? (int)(cut - file) : (int)strlen(file);
+
+			printf("-> %.*s\n", len, file);
+		}
+		
+		if (verbose)
+			printf("XS: ");
+		xs = do_next_op(&xs_ops, xsh, i+data->seed, verbose);
+		if (verbose) {
+			const char *cut = strchr(xs, '/');
+			int len = cut ? (int)(cut - xs) : (int)strlen(xs);
+
+			printf("-> %.*s\n", len, xs);
+		}
+
+		if (!streq(file, xs)) {
+			talloc_free(file);
+			talloc_free(xs);
+			goto out;
+		}
+
+		/* Remember which node the open transaction covers. */
+		if (strstarts(file, "OK:START-TRANSACT:"))
+			transact = talloc_strdup(NULL,
+						 file +
+						 strlen("OK:START-TRANSACT:"));
+		else if (streq(file, "OK:STOP-TRANSACT")) {
+			talloc_free(transact);
+			transact = NULL;
+		}
+
+		talloc_free(file);
+		talloc_free(xs);
+
+		if (data->fast)
+			continue;
+
+		fail = NULL;
+		if (!ops_equal(&xs_ops, xsh, &file_ops, fileh, "/", &fail)) {
+			if (fail)
+				barf("%s failed during test\n", fail->name);
+			if (verbose)
+				printf("Trees differ:\nXS:%s\nFILE%s\n",
+				       dump(&xs_ops, xsh),
+				       dump(&file_ops, fileh));
+			goto out;
+		}
+
+		if (transact) {
+			/* Fresh handles sit outside the transaction: both
+			 * backends must show the same view from there. */
+			void *fileh_pre = file_handle(data->dir);
+			void *xsh_pre = xs_handle(data->dir);
+
+			fail = NULL;
+			if (!ops_equal(&xs_ops, xsh_pre, &file_ops, fileh_pre,
+				       transact, &fail)) {
+				if (fail)
+					barf("%s failed during transact\n",
+					     fail->name);
+
+				xs_daemon_close(xsh_pre);
+				talloc_free(fileh_pre);
+				goto out;
+			}
+			xs_daemon_close(xsh_pre);
+			talloc_free(fileh_pre);
+		}
+	}
+	if (data->print_progress)
+		printf("\n");
+
+	fail = NULL;
+	if (data->fast)
+		if (!ops_equal(&xs_ops, xsh, &file_ops, fileh, "/", &fail))
+			barf("Final result not the same: try without --fast");
+out:
+	if (transact)
+		talloc_free(transact);
+	file_ops.close(fileh);	
+	xs_ops.close(xsh);	
+	return i;
+}
+
+/* Default mode: run "iters" random operations against both backends and
+ * compare them.  Exits 0 on success; on failure reports the iteration and
+ * hands over to reduce_problem(), which exits too. */
+static void diff_test(const char *dir,
+		      unsigned int iters, unsigned int seed, bool fast, 
+		      bool verbose)
+{
+	struct diff_data data = {
+		.seed = seed,
+		.print_progress = !verbose,
+		.fast = fast,
+		.dir = dir,
+	};
+	unsigned int try = try_diff(NULL, iters, verbose, &data);
+
+	if (try != iters) {
+		printf("Failed on iteration %u\n", try + 1);
+		data.print_progress = false;
+		reduce_problem(try + 1, try_diff, &data);
+		return;
+	}
+
+	cleanup_xs_ops();
+	printf("Succeeded\n");
+	exit(0);
+}
+
+/* Parameters handed to try_fail() through its void *data argument. */
+struct fail_data
+{
+	/* Seeds each random operation and is sent with "failtest". */
+	unsigned int seed;
+	/* Print a '.' every so often while running. */
+	bool print_progress;
+	/* Directory for the file backend; also passed to xs_handle(). */
+	const char *dir;
+};
+
+/* Try xs with inserted failures: every op should either succeed or fail.
+ *
+ * Both backends are restarted and the daemon is sent the "failtest" debug
+ * command.  Each random operation is then run against the daemon; failures
+ * are switched off with SIGUSR1, the operation is replayed on the file
+ * backend if it succeeded, and the two trees are compared through fresh
+ * handles before "failtest" is sent again.  A lost connection counts as
+ * "maybe succeeded": if the trees differ, the file operation is applied
+ * and the comparison repeated.
+ *
+ * trymap:  if non-NULL, operation i is skipped when trymap[i] is false.
+ * verbose: print each operation and the first line of its result.
+ * _data:   a struct fail_data.
+ *
+ * Returns "number" on success, otherwise the index of the operation at
+ * which the run was abandoned. */
+static unsigned int try_fail(const bool *trymap,
+			      unsigned int number,
+			      bool verbose,
+			      void *_data)
+{
+	unsigned int i, print, tried = 0, aborted = 0;
+	struct fail_data *data = _data;
+	struct xs_handle *tmpxsh;
+	struct file_ops_info *tmpfileh;
+	void *fileh, *xsh;
+	struct ops *fail;
+	char seed[20];
+
+	/* Make sure failures off to shut down. */
+	/* (Only signal a real child: a pid of -1 would mean "everybody".) */
+	if (daemon_pid > 0)
+		kill(daemon_pid, SIGUSR1);
+	cleanup(data->dir);
+	setup(data->dir);
+
+	fileh = file_handle(data->dir);
+	xsh = xs_handle(data->dir);
+
+	sprintf(seed, "%i", data->seed);
+	free(xs_debug_command(xsh, "failtest", seed, strlen(seed)+1));
+
+	/* Aim for roughly one line of progress dots. */
+	print = number / 76;
+	if (!print)
+		print = 1;
+
+	for (i = 0; i < number; i++) {
+		unsigned int limit, failed;
+		char *ret;
+
+		/* A few times we fail due to other end OOM. */
+		limit = 0;
+		while (!xsh) {
+			xsh = xs_handle(data->dir);
+			if (!xsh && errno == ECONNREFUSED) {
+				if (verbose)
+					printf("Daemon refused connection\n");
+				goto out;
+			}
+			if (!xsh && limit++ == 5) {
+				printf("Daemon failed conn 5 times\n");
+				goto out;
+			}
+		}
+
+		if (data->print_progress) {
+			if (i % print == 0) {
+				printf(".");
+				fflush(stdout);
+			}
+		}
+		if (trymap && !trymap[i])
+			continue;
+
+		if (verbose)
+			printf("(%i) ", i);
+		ret = do_next_op(&xs_ops, xsh, i + data->seed, verbose);
+		/* failed: 0 = op succeeded, 1 = op reported an error,
+		 * 2 = connection lost, outcome unknown. */
+		if (streq(ret, "FAILED:Connection reset by peer")
+		    || streq(ret, "FAILED:Bad file descriptor")
+		    || streq(ret, "FAILED:Broken pipe")) {
+			xs_close(xsh);
+			xsh = NULL;
+			failed = 2;
+		} else if (strstarts(ret, "OK"))
+			failed = 0;
+		else
+			failed = 1;
+
+		/* NOTE(review): "aborted" is bumped when the connection
+		 * SURVIVED, so the total printed at the end counts lost
+		 * connections - confirm the intended wording. */
+		tried++;
+		if (xsh)
+			aborted++;
+
+		if (verbose) {
+			/* First line only.  The result need not contain a
+			 * newline at all, and "%.*s" wants an int. */
+			const char *eol = strchr(ret, '\n');
+			int len = eol ? (int)(eol - ret) : (int)strlen(ret);
+
+			printf("-> %.*s\n", len, ret);
+		}
+
+		talloc_free(ret);
+
+		/* Turn off failures using signal. */
+		if (kill(daemon_pid, SIGUSR1) != 0) {
+			if (verbose)
+				printf("Failed to signal daemon\n");
+			goto out;
+		}
+
+		if (failed == 0) {
+			/* Succeeded?  Do same thing to file backend
+			 * to compare */
+		try_applying:
+			ret = do_next_op(&file_ops, fileh, i + data->seed,
+					 false);
+			if (!strstarts(ret, "OK")) {
+				if (!verbose)
+					printf("File op failed on %i\n",
+					       i + data->seed);
+				talloc_free(ret);
+				goto out;
+			}
+			talloc_free(ret);
+		}
+
+		tmpxsh = xs_handle(data->dir);
+		if (!tmpxsh) {
+			if (verbose)
+				printf("Failed to open signalled daemon");
+			goto out;
+		}
+		tmpfileh = file_handle(data->dir);
+
+		fail = NULL;
+		if (!ops_equal(&xs_ops, tmpxsh, &file_ops, tmpfileh, "/",
+			       &fail)) {
+			xs_close(tmpxsh);
+			file_close(tmpfileh);
+			if (fail) {
+				if (verbose)
+					printf("%s failed\n", fail->name);
+				goto out;
+			}
+			/* Maybe op succeeded: try comparing after local op? */
+			if (failed == 2) {
+				failed = 0;
+				if (verbose)
+					printf("(Looks like it succeeded)\n");
+				goto try_applying;
+			}
+			if (verbose)
+				printf("Two backends not equal\n");
+			goto out;
+		}
+
+		/* If we lost the xs handle, that ended the transaction */
+		if (!xsh)
+			file_transaction_end(fileh, true);
+
+		/* Turn failures back on. */
+		free(xs_debug_command(tmpxsh, "failtest",  NULL, 0));
+		xs_close(tmpxsh);
+		file_close(tmpfileh);
+	}
+
+	printf("Total %u of %u not aborted\n", tried - aborted, tried);
+out:
+	if (xsh)
+		xs_close(xsh);
+	/* The long-lived file handle is ours to release as well. */
+	file_close(fileh);
+	return i;
+}
+
+/* Entry point for --fail: run "iters" random operations against the
+ * daemon via try_fail().  "fast" is accepted for symmetry with the other
+ * modes and ignored.  Exits 0 on success; on failure reports the iteration
+ * and hands over to reduce_problem(), which exits too. */
+static void fail_test(const char *dir,
+		      unsigned int iters, unsigned int seed,
+		      bool fast __attribute__((unused)), bool verbose)
+{
+	struct fail_data data = {
+		.seed = seed,
+		.print_progress = !verbose,
+		.dir = dir,
+	};
+	unsigned int try = try_fail(NULL, iters, verbose, &data);
+
+	if (try != iters) {
+		printf("Failed on iteration %u\n", try + 1);
+		fflush(stdout);
+		data.print_progress = false;
+		reduce_problem(try + 1, try_fail, &data);
+		return;
+	}
+
+	cleanup_xs_ops();
+	printf("Succeeded\n");
+	exit(0);
+}
+
+/* xs_random [--fail|--simple] [--fast] [--verbose] <dir> <iters> <seed>
+ *
+ * Options are recognised only in the order shown, each at most once.  The
+ * selected test never returns; should one do so anyway we exit with 2. */
+int main(int argc, char *argv[])
+{
+	enum { OPT_FAIL, OPT_SIMPLE, OPT_FAST, OPT_VERBOSE, OPT_COUNT };
+	static const char *const optname[OPT_COUNT] = {
+		"--fail", "--simple", "--fast", "--verbose"
+	};
+	bool chosen[OPT_COUNT] = { false, false, false, false };
+	unsigned int opt;
+
+	/* Each option is looked for exactly once, in table order. */
+	for (opt = 0; opt < OPT_COUNT; opt++) {
+		if (argv[1] && streq(argv[1], optname[opt])) {
+			chosen[opt] = true;
+			argv++;
+			argc--;
+		}
+	}
+
+	if (argc != 4)
+		barf("Usage: xs_random [--fail|--simple] [--fast] [--verbose] <directory> <iterations> <seed>");
+
+	talloc_enable_null_tracking();
+
+	if (chosen[OPT_FAIL])
+		fail_test(argv[1], atoi(argv[2]), atoi(argv[3]),
+			  chosen[OPT_FAST], chosen[OPT_VERBOSE]);
+	else if (chosen[OPT_SIMPLE])
+		simple_test(argv[1], atoi(argv[2]), atoi(argv[3]),
+			    chosen[OPT_FAST], chosen[OPT_VERBOSE]);
+	else
+		diff_test(argv[1],  atoi(argv[2]), atoi(argv[3]),
+			  chosen[OPT_FAST], chosen[OPT_VERBOSE]);
+	exit(2);
+}
diff --git a/tools/xenstore/xs_stress.c b/tools/xenstore/xs_stress.c
new file mode 100644
index 0000000000..9c480b1553
--- /dev/null
+++ b/tools/xenstore/xs_stress.c
@@ -0,0 +1,207 @@
+/* Stress test for Xen Store: multiple people hammering transactions */
+#include "xs.h"
+#include "utils.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/* Connections each worker opens to the daemon; one is picked at random
+ * for every transaction. */
+#define NUM_HANDLES 2
+/* Number of children (named 0 .. DIR_FANOUT-1) under every directory. */
+#define DIR_FANOUT 3
+/* Number of directory levels above each "count" node. */
+#define DIR_DEPTH 3
+
+/* How often to print progress */
+static int print;
+
+/* Layout looks like /<num>/.../<num>/count, DIR_DEPTH levels deep.
+ *
+ * Body of one child: perform "cycles" committed transactions.  Each one
+ * picks a random count node, starts a transaction on a random ancestor of
+ * it, increments the counter and ends the transaction.  One transaction in
+ * ten is aborted instead of committed and does not count towards "cycles".
+ * "childnum" seeds random() and selects the progress character.  Any
+ * daemon error is fatal. */
+static void work(unsigned int cycles, unsigned int childnum)
+{
+	unsigned int i;
+	struct xs_handle *handles[NUM_HANDLES];
+	char id;
+
+	if (childnum < 10)
+		id = '0' + childnum;
+	else
+		id = 'A' + childnum - 10;
+
+	for (i = 0; i < NUM_HANDLES; i++) {
+		handles[i] = xs_daemon_open();
+		if (!handles[i])
+			barf_perror("Opening handle %i", i);
+	}
+
+	srandom(childnum);
+	for (i = 0; i < cycles; i++) {
+		unsigned int lockdepth, j, len;
+		char file[100] = "", lockdir[100];
+		char *contents, tmp[100];
+		struct xs_handle *h = handles[random() % NUM_HANDLES];
+
+		/* Build the path one component at a time; the prefix that
+		 * exists at depth "lockdepth" is what the transaction is
+		 * started on. */
+		lockdepth = random() % DIR_DEPTH;
+		for (j = 0; j < DIR_DEPTH; j++) {
+			if (j == lockdepth)
+				strcpy(lockdir, file);
+			sprintf(file + strlen(file), "/%li",
+				random()%DIR_FANOUT);
+		}
+		if (streq(lockdir, ""))
+			strcpy(lockdir, "/");
+		
+		if (!xs_transaction_start(h, lockdir))
+			barf_perror("%i: starting transaction %i on %s",
+				    childnum, i, lockdir);
+
+		sprintf(file + strlen(file), "/count");
+		contents = xs_read(h, file, &len);
+		if (!contents)
+			barf_perror("%i: can't read %s iter %i",
+				    childnum, file, i);
+		sprintf(tmp, "%i", atoi(contents) + 1);
+		/* The value read is ours to release. */
+		free(contents);
+		if (!xs_write(h, file, tmp, strlen(tmp)+1, 0))
+			barf_perror("%i: can't write %s iter %i",
+				    childnum, file, i);
+
+		/* Abandon 1 in 10 */
+		if (random() % 10 == 0) {
+			if (!xs_transaction_end(h, true))
+				barf_perror("%i: can't abort transact %s",
+					    childnum, lockdir);
+			/* Doesn't count: redo this cycle.  (At i == 0 the
+			 * unsigned wrap is undone by the loop's i++.) */
+			i--;
+		} else {
+			if (!xs_transaction_end(h, false))
+				barf_perror("%i: can't commit transact %s",
+					    childnum, lockdir);
+
+			/* Offset when we print . so kids don't all
+			 * print at once. */
+			if ((i + print/(childnum+1)) % print == 0)
+				write(STDOUT_FILENO, &id, 1);
+		}
+	}
+
+	for (i = 0; i < NUM_HANDLES; i++)
+		xs_daemon_close(handles[i]);
+}
+
+/* Recursively create the test tree below "base": "togo" levels of
+ * DIR_FANOUT directories each, with a "count" node holding "0" at the
+ * bottom of every branch.  Any daemon error is fatal. */
+static void create_dirs(struct xs_handle *h, const char *base, int togo)
+{
+	char filename[100];
+	unsigned int child;
+
+	if (togo != 0) {
+		/* Interior level: make every child, then fill it in. */
+		for (child = 0; child < DIR_FANOUT; child++) {
+			sprintf(filename, "%s/%i", base, child);
+			if (!xs_mkdir(h, filename))
+				barf_perror("xs_mkdir %s", filename);
+			create_dirs(h, filename, togo-1);
+		}
+		return;
+	}
+
+	/* Bottom of the tree: plant the counter. */
+	sprintf(filename, "%s/count", base);
+	if (!xs_write(h, filename, "0", 2, O_EXCL|O_CREAT))
+		barf_perror("Writing to %s", filename);
+}
+
+/* Return the sum of every "count" node found "togo" levels below "base".
+ * A node that cannot be read is fatal. */
+static unsigned int add_count(struct xs_handle *h, const char *base, int togo)
+{
+	char filename[100];
+	unsigned int child, total = 0;
+
+	if (togo != 0) {
+		/* Interior level: add up all the branches. */
+		for (child = 0; child < DIR_FANOUT; child++) {
+			sprintf(filename, "%s/%i", base, child);
+			total += add_count(h, filename, togo-1);
+		}
+		return total;
+	}
+
+	/* Bottom of the tree: read this branch's counter. */
+	{
+		unsigned int len;
+		char *answer;
+
+		sprintf(filename, "%s/count", base);
+		answer = xs_read(h, filename, &len);
+		if (!answer)
+			barf_perror("Reading %s", filename);
+		total = atoi(answer);
+		free(answer);
+	}
+	return total;
+}
+
+/* Connect to the daemon and build the whole directory tree, every counter
+ * starting at zero.  Fatal if the daemon cannot be reached. */
+static void setup(void)
+{
+	struct xs_handle *conn = xs_daemon_open();
+
+	if (conn == NULL)
+		barf_perror("Contacting daemon");
+
+	/* Do setup. */
+	create_dirs(conn, "", DIR_DEPTH);
+	xs_daemon_close(conn);
+}
+
+/* Connect to the daemon and return the sum of all counters in the tree.
+ * Fatal if the daemon cannot be reached. */
+static unsigned int tally_counts(void)
+{
+	struct xs_handle *conn = xs_daemon_open();
+	unsigned int total;
+
+	if (conn == NULL)
+		barf_perror("Contacting daemon");
+
+	total = add_count(conn, "", DIR_DEPTH);
+	xs_daemon_close(conn);
+	return total;
+}
+
+/* xs_stress <iterations>
+ *
+ * Builds the counter tree, forks one worker per entry in kids[] (each
+ * doing iterations / number-of-kids committed increments), waits for them
+ * all and checks that the counters add up to the number of increments
+ * actually performed.  Exits 0 on success, 1 if a child reported failure;
+ * anything else is fatal via barf(). */
+int main(int argc, char *argv[])
+{
+	unsigned int i, per_child, expected;
+	bool failed = false;
+	int kids[10];
+
+	if (argc != 2)
+		barf("Usage: xs_stress <iterations>");
+
+	printf("Setting up directories...\n");
+	setup();
+
+	print = atoi(argv[1]) / 76;
+	if (!print)
+		print = 1;
+
+	/* The division truncates, so the total the children really produce
+	 * may be less than what was asked for. */
+	per_child = atoi(argv[1]) / ARRAY_SIZE(kids);
+	expected = per_child * ARRAY_SIZE(kids);
+
+	printf("Running %i children...\n", (int)ARRAY_SIZE(kids));
+	/* Don't let the children inherit (and later re-emit) anything
+	 * still sitting in our stdio buffer. */
+	fflush(stdout);
+	for (i = 0; i < ARRAY_SIZE(kids); i++) {
+		kids[i] = fork();
+		if (kids[i] == -1)
+			barf_perror("fork");
+		if (kids[i] == 0) {
+			work(per_child, i);
+			exit(0);
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(kids); i++) {
+		int status;
+		if (waitpid(kids[i], &status, 0) == -1)
+			barf_perror("waitpid");
+		if (!WIFEXITED(status))
+			barf("Kid %i died via signal %i\n",
+			     i, WTERMSIG(status));
+		if (WEXITSTATUS(status) != 0) {
+			printf("Child %i exited %i\n", i, WEXITSTATUS(status));
+			failed = true;
+		}
+	}
+	if (failed)
+		exit(1);
+
+	printf("\nCounting results...\n");
+	i = tally_counts();
+	if (i != expected)
+		barf("Total counts %u not %u", i, expected);
+	printf("Success!\n");
+	exit(0);
+}
diff --git a/tools/xenstore/xs_test.c b/tools/xenstore/xs_test.c
new file mode 100644
index 0000000000..f1e66cbe28
--- /dev/null
+++ b/tools/xenstore/xs_test.c
@@ -0,0 +1,647 @@
+/* 
+    Xen Store Daemon Test tool
+    Copyright (C) 2005 Rusty Russell IBM Corporation
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include "utils.h"
+#include "xs_lib.h"
+
+#define XSTEST
+
+/* Connections, indexed by the handle number used in commands.  A NULL
+ * entry is a free slot. */
+static struct xs_handle *handles[10] = { NULL };
+
+/* Header at the start of one ring buffer; the data bytes follow directly
+ * in buf[].  Both indices are offsets into buf[] and are kept below
+ * ringbuf_datasize. */
+struct ringbuf_head
+{
+	uint32_t write; /* Next place to write to */
+	uint32_t read; /* Next place to read from */
+	uint8_t flags; /* Not touched by the code in this file. */
+	char buf[0];
+} __attribute__((packed));
+
+/* The two rings in the page mapped by do_introduce(): "out" at the start
+ * of the page, "in" half a page further on. */
+static struct ringbuf_head *out, *in;
+/* Number of data bytes in each ring. */
+static unsigned int ringbuf_datasize;
+/* PID the shared-memory helpers signal with SIGUSR2; do_introduce() reads
+ * it back out of the mapped page. */
+static int daemon_pid;
+
+/* Sanity-check a ring header: both indices must lie inside the data area.
+ * FIXME: Mark connection as broken (close it?) when this happens. */
+static bool check_buffer(const struct ringbuf_head *h)
+{
+	if (h->write >= ringbuf_datasize)
+		return false;
+	return h->read < ringbuf_datasize;
+}
+
+/* Find the next contiguous stretch that may be written.  Returns its
+ * start (buf plus the write index) and stores its length in *len.
+ *
+ * We can't fill last byte: would look like empty buffer.  So writing stops
+ * one byte short of the read index. */
+static void *get_output_chunk(const struct ringbuf_head *h,
+			      void *buf, uint32_t *len)
+{
+	/* Last position we may advance to: one before the reader, wrapping
+	 * round the end of the buffer. */
+	uint32_t limit = h->read ? h->read - 1 : ringbuf_datasize - 1;
+
+	if (limit >= h->write)
+		/* Unread data lies ahead of us: stop just before it. */
+		*len = limit - h->write;
+	else
+		/* Otherwise we may run to the end of the buffer. */
+		*len = ringbuf_datasize - h->write;
+	return buf + h->write;
+}
+
+/* Find the next contiguous stretch that may be read.  Returns its start
+ * (buf plus the read index) and stores its length in *len, which is 0
+ * when the two indices are equal. */
+static const void *get_input_chunk(const struct ringbuf_head *h,
+				   const void *buf, uint32_t *len)
+{
+	if (h->write >= h->read)
+		/* Writer is ahead of us: read up to where it got to. */
+		*len = h->write - h->read;
+	else
+		/* Writer has wrapped: read up to the end of the buffer. */
+		*len = ringbuf_datasize - h->read;
+	return buf + h->read;
+}
+
+/* Advance the write index by "len" bytes, wrapping to 0 on reaching the
+ * end of the data area. */
+static void update_output_chunk(struct ringbuf_head *h, uint32_t len)
+{
+	h->write += len;
+	if (h->write == ringbuf_datasize)
+		h->write = 0;
+}
+
+/* Advance the read index by "len" bytes, wrapping to 0 on reaching the
+ * end of the data area. */
+static void update_input_chunk(struct ringbuf_head *h, uint32_t len)
+{
+	h->read += len;
+	if (h->read == ringbuf_datasize)
+		h->read = 0;
+}
+
+/* Read exactly "len" bytes from the "in" ring into "data", then signal
+ * daemon_pid with SIGUSR2.  "fd" is ignored.  Always returns true; a
+ * corrupt ring header is fatal.
+ *
+ * FIXME: We spin, and we're sloppy. */
+static bool read_all_shmem(int fd __attribute__((unused)),
+			   void *data, unsigned int len)
+{
+	unsigned int avail;
+
+	if (!check_buffer(in))
+		barf("Corrupt buffer");
+
+	/* When nothing is available avail is 0 and we simply go round
+	 * again: this is the spinning the FIXME refers to. */
+	while (len) {
+		const void *src = get_input_chunk(in, in->buf, &avail);
+		if (avail > len)
+			avail = len;
+		memcpy(data, src, avail);
+		data += avail;
+		len -= avail;
+		update_input_chunk(in, avail);
+	}
+
+	/* Tell other end we read something. */
+	kill(daemon_pid, SIGUSR2);
+	return true;
+}
+
+/* Write exactly "len" bytes from "data" into the "out" ring, then signal
+ * daemon_pid with SIGUSR2.  "fd" is ignored.  Always returns true; a
+ * corrupt ring header is fatal. */
+static bool write_all_shmem(int fd __attribute__((unused)),
+			    const void *data, unsigned int len)
+{
+	uint32_t avail;
+
+	if (!check_buffer(out))
+		barf("Corrupt buffer");
+
+	/* With no room avail is 0 and we go round again until the other
+	 * side has moved its read index. */
+	while (len) {
+		void *dst = get_output_chunk(out, out->buf, &avail);
+		if (avail > len)
+			avail = len;
+		memcpy(dst, data, avail);
+		data += avail;
+		len -= avail;
+		update_output_chunk(out, avail);
+	}
+
+	/* Tell other end we wrote something. */
+	kill(daemon_pid, SIGUSR2);
+	return true;
+}
+
+static bool read_all(int fd, void *data, unsigned int len);
+
+/* Read "len" bytes into "data": from the shared-memory ring when fd is
+ * the special value -2, from the file descriptor otherwise. */
+static bool read_all_choice(int fd, void *data, unsigned int len)
+{
+	return fd == -2 ? read_all_shmem(fd, data, len)
+			: read_all(fd, data, len);
+}
+
+/* Write "len" bytes from "data": to the shared-memory ring when fd is the
+ * special value -2, to the file descriptor otherwise. */
+static bool write_all_choice(int fd, const void *data, unsigned int len)
+{
+	return fd == -2 ? write_all_shmem(fd, data, len)
+			: write_all(fd, data, len);
+}
+
+/* We want access to internal functions. */
+#include "xs.c"
+
+/* Print the command summary via barf(); never returns. */
+static void __attribute__((noreturn)) usage(void)
+{
+	barf("Usage:\n"
+	     "       xs_test [--readonly] [--notimeout]\n"
+	     "Reads commands from stdin, one per line:\n"
+	     "  dir <path>\n"
+	     "  read <path>\n"
+	     "  write <path> <flags> <value>...\n"
+	     "  setid <id>\n"
+	     "  mkdir <path>\n"
+	     "  rm <path>\n"
+	     "  getperm <path>\n"
+	     "  setperm <path> <id> <flags> ...\n"
+	     "  shutdown\n"
+	     "  watch <path> <prio>\n"
+	     "  waitwatch\n"
+	     "  ackwatch\n"
+	     "  unwatch <path>\n"
+	     "  close\n"
+	     "  start <node>\n"
+	     "  abort\n"
+	     "  introduce <domid> <mfn> <eventchn>\n"
+	     "  commit\n"
+	     "  sleep <seconds>\n"
+	     "  dump\n");
+}
+
+/* Return a NUL-terminated copy of whitespace-separated word number "num"
+ * (counting from 0) of "line".
+ *
+ * The copy lives in a per-index static slot: it stays valid until arg()
+ * is next called with the same "num", and must not be freed by the
+ * caller.  A missing word, an index without a slot, or running out of
+ * memory is fatal. */
+static char *arg(char *line, unsigned int num)
+{
+	static char *args[10];
+	unsigned int i, len = 0;
+
+	if (num >= ARRAY_SIZE(args))
+		barf("Arg index %u out of range", num);
+
+	/* Step over num+1 words; afterwards line/len describe the last. */
+	for (i = 0; i <= num; i++) {
+		line += len;
+		line += strspn(line, " \t\n");
+		len = strcspn(line, " \t\n");
+		if (!len)
+			barf("Can't get arg %u", num);
+	}
+
+	free(args[num]);
+	args[num] = malloc(len + 1);
+	if (!args[num])
+		barf_perror("Allocating arg %u", num);
+	memcpy(args[num], line, len);
+	args[num][len] = '\0';
+	return args[num];
+}
+
+/* Text reported by failed() as the thing that went wrong. */
+static char *command;
+
+/* Report "command" together with the current errno and die.  The handle
+ * number is included in the message unless it is 0. */
+static void __attribute__((noreturn)) failed(int handle)
+{
+	if (handle == 0)
+		barf_perror("%s", command);
+	else
+		barf_perror("%i: %s", handle, command);
+}
+
+/* List the children of "path", one per line; lines are prefixed with
+ * "<handle>:" unless the handle is 0.  Fatal if the listing fails. */
+static void do_dir(unsigned int handle, char *path)
+{
+	unsigned int count, idx = 0;
+	char **names = xs_directory(handles[handle], path, &count);
+
+	if (names == NULL)
+		failed(handle);
+
+	while (idx < count) {
+		if (!handle)
+			printf("%s\n", names[idx]);
+		else
+			printf("%i:%s\n", handle, names[idx]);
+		idx++;
+	}
+	free(names);
+}
+
+/* Print the contents of "path" on one line, prefixed with "<handle>:"
+ * unless the handle is 0.  Fatal if the read fails. */
+static void do_read(unsigned int handle, char *path)
+{
+	char *value;
+	unsigned int len;
+
+	value = xs_read(handles[handle], path, &len);
+	if (!value)
+		failed(handle);
+
+	if (handle)
+		printf("%i:%.*s\n", handle, len, value);
+	else
+		printf("%.*s\n", len, value);
+	/* The buffer belongs to us once xs_read() has returned it. */
+	free(value);
+}
+
+/* Write "data" (including its terminating NUL) to "path".
+ *
+ * "flags" is one of "none", "create", "excl", or "crap" (a deliberately
+ * invalid value, 100); anything else is fatal, as is a failed write. */
+static void do_write(unsigned int handle, char *path, char *flags, char *data)
+{
+	static const struct {
+		const char *word;
+		int value;
+	} known[] = {
+		{ "none", 0 },
+		{ "create", O_CREAT },
+		{ "excl", O_CREAT | O_EXCL },
+		{ "crap", 100 },
+	};
+	unsigned int k;
+	int f = 0;
+
+	for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+		if (streq(flags, known[k].word)) {
+			f = known[k].value;
+			break;
+		}
+	}
+	if (k == sizeof(known) / sizeof(known[0]))
+		barf("write flags 'none', 'create' or 'excl' only");
+
+	if (!xs_write(handles[handle], path, data, strlen(data)+1, f))
+		failed(handle);
+}
+
+/* Send the "setid" debug command with "id" (NUL included) as payload;
+ * fatal if it is refused. */
+static void do_setid(unsigned int handle, char *id)
+{
+	bool ok = xs_bool(xs_debug_command(handles[handle], "setid", id,
+					   strlen(id)+1));
+
+	if (!ok)
+		failed(handle);
+}
+
+static void do_mkdir(unsigned int handle, char *path)
+{
+	if (!xs_mkdir(handles[handle], path))
+		failed(handle);
+}
+
+static void do_rm(unsigned int handle, char *path)
+{
+	if (!xs_rm(handles[handle], path))
+		failed(handle);
+}
+
+/* Print the permission list of "path" as "<id> <PERM>" lines, prefixed
+ * with "<handle>:" unless the handle is 0.  Fatal if the permissions
+ * cannot be fetched or contain an unknown value. */
+static void do_getperm(unsigned int handle, char *path)
+{
+	unsigned int count, idx;
+	struct xs_permissions *list;
+
+	list = xs_get_permissions(handles[handle], path, &count);
+	if (list == NULL)
+		failed(handle);
+
+	for (idx = 0; idx < count; idx++) {
+		const char *text;
+
+		if (list[idx].perms == XS_PERM_NONE)
+			text = "NONE";
+		else if (list[idx].perms == XS_PERM_WRITE)
+			text = "WRITE";
+		else if (list[idx].perms == XS_PERM_READ)
+			text = "READ";
+		else if (list[idx].perms == (XS_PERM_READ|XS_PERM_WRITE))
+			text = "READ/WRITE";
+		else
+			barf("bad perm value %i", list[idx].perms);
+
+		if (!handle)
+			printf("%i %s\n", list[idx].id, text);
+		else
+			printf("%i:%i %s\n", handle, list[idx].id, text);
+	}
+	free(list);
+}
+
+/* Set the permissions of "path" from the "<id> <flags>" pairs on "line".
+ *
+ * "line" is the whole command line and is chopped up in place by
+ * strtok(); its first two words (command and path) are skipped.  Flags
+ * are WRITE, READ, READ/WRITE or NONE.  An id without flags at the end of
+ * the line is ignored.  Unknown flags, more pairs than fit, or the daemon
+ * refusing the request are fatal. */
+static void do_setperm(unsigned int handle, char *path, char *line)
+{
+	unsigned int i;
+	struct xs_permissions perms[100];
+
+	strtok(line, " \t\n");
+	strtok(NULL, " \t\n");
+	for (i = 0; ; i++) {
+		char *arg = strtok(NULL, " \t\n");
+		if (!arg)
+			break;
+		/* Don't run off the end of perms[]. */
+		if (i == ARRAY_SIZE(perms))
+			barf("too many permissions (max %u)",
+			     (unsigned int)ARRAY_SIZE(perms));
+		perms[i].id = atoi(arg);
+		arg = strtok(NULL, " \t\n");
+		if (!arg)
+			break;
+		if (streq(arg, "WRITE"))
+			perms[i].perms = XS_PERM_WRITE;
+		else if (streq(arg, "READ"))
+			perms[i].perms = XS_PERM_READ;
+		else if (streq(arg, "READ/WRITE"))
+			perms[i].perms = XS_PERM_READ|XS_PERM_WRITE;
+		else if (streq(arg, "NONE"))
+			perms[i].perms = XS_PERM_NONE;
+		else
+			barf("bad flags %s\n", arg);
+	}
+
+	if (!xs_set_permissions(handles[handle], path, perms, i))
+		failed(handle);
+}
+
+static void do_shutdown(unsigned int handle)
+{
+	if (!xs_shutdown(handles[handle]))
+		failed(handle);
+}
+
+/* Register a watch on "node" with the priority given as the decimal
+ * string "pri"; fatal on error. */
+static void do_watch(unsigned int handle, const char *node, const char *pri)
+{
+	int priority = atoi(pri);
+	bool ok = xs_watch(handles[handle], node, priority);
+
+	if (!ok)
+		failed(handle);
+}
+
+/* Read the next watch event and print the node it names, prefixed with
+ * "<handle>:" unless the handle is 0.  Fatal on error. */
+static void do_waitwatch(unsigned int handle)
+{
+	char *fired = xs_read_watch(handles[handle]);
+
+	if (fired == NULL)
+		failed(handle);
+
+	if (!handle)
+		printf("%s\n", fired);
+	else
+		printf("%i:%s\n", handle, fired);
+	free(fired);
+}
+
+static void do_ackwatch(unsigned int handle)
+{
+	if (!xs_acknowledge_watch(handles[handle]))
+		failed(handle);
+}
+
+static void do_unwatch(unsigned int handle, const char *node)
+{
+	if (!xs_unwatch(handles[handle], node))
+		failed(handle);
+}
+
+static void do_start(unsigned int handle, const char *node)
+{
+	if (!xs_transaction_start(handles[handle], node))
+		failed(handle);
+}
+
+/* End the current transaction, discarding it if "abort" is true and
+ * committing it otherwise; fatal on error. */
+static void do_end(unsigned int handle, bool abort)
+{
+	bool ok = xs_transaction_end(handles[handle], abort);
+
+	if (!ok)
+		failed(handle);
+}
+
+/* Introduce a fake domain to the daemon and open a shared-memory
+ * connection for it.
+ *
+ * The page behind /tmp/xcmap is mapped: "out" is set to its start and
+ * "in" to its second half.  Our PID and the event channel number are
+ * stored at offsets 32 and 36 of the page, a new handle with the special
+ * fd -2 is put into the first free slot of handles[], and the domain is
+ * introduced through the existing connection "handle".  Afterwards the
+ * daemon's PID is read back from offset 32.
+ *
+ * No free slot, failure to open or map the file, or the daemon refusing
+ * the introduction is fatal. */
+static void do_introduce(unsigned int handle,
+			 const char *domid,
+			 const char *mfn,
+			 const char *eventchn,
+			 const char *path)
+{
+	unsigned int i;
+	int fd;
+
+	/* We poll, so ignore signal */
+	signal(SIGUSR2, SIG_IGN);
+	for (i = 0; i < ARRAY_SIZE(handles); i++)
+		if (!handles[i])
+			break;
+	/* All slots taken: don't write past the end of handles[]. */
+	if (i == ARRAY_SIZE(handles))
+		barf("No free handle for new domain");
+
+	fd = open("/tmp/xcmap", O_RDWR);
+	if (fd < 0)
+		barf_perror("Failed to open /tmp/xcmap");
+	/* Set in and out pointers. */
+	out = mmap(NULL, getpagesize(), PROT_WRITE|PROT_READ, MAP_SHARED,fd,0);
+	if (out == MAP_FAILED)
+		barf_perror("Failed to map /tmp/xcmap page");
+	in = (void *)out + getpagesize() / 2;
+	/* The mapping stays valid after the descriptor is closed. */
+	close(fd);
+
+	/* Tell them the event channel and our PID. */
+	*(int *)((void *)out + 32) = getpid();
+	*(u16 *)((void *)out + 36) = atoi(eventchn);
+
+	/* Create new handle. */
+	handles[i] = new(struct xs_handle);
+	handles[i]->fd = -2;
+
+	if (!xs_introduce_domain(handles[handle], atoi(domid),
+				 atol(mfn), atoi(eventchn), path))
+		failed(handle);
+	printf("handle is %i\n", i);
+
+	/* Read in daemon pid. */
+	daemon_pid = *(int *)((void *)out + 32);
+}
+
+/* Release the domain whose id is the decimal string "domid"; fatal on
+ * error. */
+static void do_release(unsigned int handle, const char *domid)
+{
+	int id = atoi(domid);
+	bool ok = xs_release_domain(handles[handle], id);
+
+	if (!ok)
+		failed(handle);
+}
+
+/*
+ * qsort() comparator for an array of C strings: @a and @b each point at
+ * an array element (a char pointer); the pointed-to strings are ordered
+ * with strcmp().
+ */
+static int strptrcmp(const void *a, const void *b)
+{
+	const char *const *lhs = a;
+	const char *const *rhs = b;
+
+	return strcmp(*lhs, *rhs);
+}
+
+/* Sort the @num string pointers in @dir in place, in strcmp() order. */
+static void sort_dir(char **dir, unsigned int num)
+{
+	qsort(dir, num, sizeof(*dir), strptrcmp);
+}
+
+/*
+ * Recursively print the entries below @node, as seen through connection
+ * @handle.
+ *
+ * @dir / @numdirs: the child names of @node; sorted in place before use.
+ * @depth:          recursion depth, used as the number of spaces of
+ *                  indentation.
+ *
+ * For each child this prints "<indent><name>: <perms...>", then the
+ * node's contents in parentheses if xs_read() returns any, then recurses
+ * into the child's own directory listing.
+ */
+static void dump_dir(unsigned int handle,
+		     const char *node,
+		     char **dir,
+		     unsigned int numdirs,
+		     unsigned int depth)
+{
+	unsigned int i;
+	char spacing[depth+1];
+
+	memset(spacing, ' ', depth);
+	spacing[depth] = '\0';
+
+	sort_dir(dir, numdirs);
+
+	for (i = 0; i < numdirs; i++) {
+		struct xs_permissions *perms;
+		unsigned int j, numperms;
+		unsigned int len;
+		char *contents;
+		unsigned int subnum;
+		char **subdirs;
+		/* parent + '/' + child + NUL */
+		char subnode[strlen(node) + 1 + strlen(dir[i]) + 1];
+
+		sprintf(subnode, "%s/%s", node, dir[i]);
+
+		perms = xs_get_permissions(handles[handle], subnode,&numperms);
+		if (!perms) {
+			failed(handle);
+			/* failed() is defined outside this function; if it
+			 * returns, perms is NULL and numperms is unset, so
+			 * stop here rather than dereference them. */
+			return;
+		}
+
+		printf("%s%s: ", spacing, dir[i]);
+		for (j = 0; j < numperms; j++) {
+			char buffer[100];
+			if (!perm_to_string(&perms[j], buffer))
+				barf("perm to string");
+			printf("%s ", buffer);
+		}
+		free(perms);
+		printf("\n");
+
+		/* Even directories can have contents. */
+		contents = xs_read(handles[handle], subnode, &len);
+		if (!contents) {
+			if (errno != EISDIR)
+				failed(handle);
+		} else {
+			/* "%.*s" takes an int precision. */
+			printf(" %s(%.*s)\n", spacing, (int)len, contents);
+			free(contents);
+		}
+
+		/* Every node is a directory. */
+		subdirs = xs_directory(handles[handle], subnode, &subnum);
+		if (!subdirs) {
+			failed(handle);
+			/* As above: never recurse with a NULL listing and
+			 * an unset count. */
+			return;
+		}
+		dump_dir(handle, subnode, subdirs, subnum, depth+1);
+		free(subdirs);
+	}
+}
+
+/*
+ * "dump" command: list "/" through connection @handle and print the
+ * whole tree below it with dump_dir().
+ */
+static void dump(int handle)
+{
+	char **subdirs;
+	unsigned int subnum;
+
+	subdirs = xs_directory(handles[handle], "/", &subnum);
+	if (!subdirs) {
+		failed(handle);
+		/* failed() is defined outside this function; if it returns,
+		 * subnum is unset and subdirs is NULL, so there is nothing
+		 * to walk. */
+		return;
+	}
+
+	dump_dir(handle, "", subdirs, subnum, 0);
+	free(subdirs);
+}
+
+/* Handle number of the command currently running, for alarmed(). */
+static int alarm_handle;
+
+/*
+ * SIGALRM handler: print "[<handle>:]<command> timeout" and exit(1).
+ *
+ * This lives at file scope rather than nested inside main(): GCC
+ * documents a nested function declared "static" as erroneous, and taking
+ * the address of a nested function that uses its parent's locals needs a
+ * stack trampoline.
+ */
+static void alarmed(int sig __attribute__((unused)))
+{
+	if (alarm_handle) {
+		/* Big enough for any unsigned int plus ':' and NUL. */
+		char handlename[16];
+		sprintf(handlename, "%u:", (unsigned int)alarm_handle);
+		write(STDOUT_FILENO, handlename, strlen(handlename));
+	}
+	write(STDOUT_FILENO, command, strlen(command));
+	write(STDOUT_FILENO, " timeout\n", strlen(" timeout\n"));
+	exit(1);
+}
+
+/*
+ * Read commands from stdin, one per line, and run them against the
+ * store daemon.
+ *
+ * Options (in this order): "--readonly" opens connections with
+ * xs_daemon_open_readonly(); "--notimeout" disables the 5 second alarm
+ * armed around each command.
+ *
+ * Blank lines and lines starting with '#' are skipped.  A line may start
+ * with a decimal handle number followed by one separator character;
+ * without one, handle 0 is used.  A connection is opened the first time
+ * a handle is used.  Unknown commands abort via barf().
+ */
+int main(int argc, char *argv[])
+{
+	char line[1024];
+	bool readonly = false, timeout = true;
+	int handle;
+
+	if (argc > 1 && streq(argv[1], "--readonly")) {
+		readonly = true;
+		argc--;
+		argv++;
+	}
+
+	if (argc > 1 && streq(argv[1], "--notimeout")) {
+		timeout = false;
+		argc--;
+		argv++;
+	}
+
+	if (argc != 1)
+		usage();
+
+	/* The size of the ringbuffer: half a page minus head structure. */
+	ringbuf_datasize = getpagesize() / 2 - sizeof(struct ringbuf_head);
+
+	signal(SIGALRM, alarmed);
+	while (fgets(line, sizeof(line), stdin)) {
+		char *endp;
+		unsigned long num;
+
+		if (strspn(line, " \n") == strlen(line))
+			continue;
+		if (strstarts(line, "#"))
+			continue;
+
+		num = strtoul(line, &endp, 10);
+		if (endp != line) {
+			/* The number indexes handles[]; reject anything
+			 * that would land outside the table. */
+			if (num >= ARRAY_SIZE(handles))
+				barf("Handle %lu out of range", num);
+			handle = num;
+			/* Drop the number and the separator after it. */
+			memmove(line, endp+1, strlen(endp));
+		} else
+			handle = 0;
+		alarm_handle = handle;
+
+		if (!handles[handle]) {
+			if (readonly)
+				handles[handle] = xs_daemon_open_readonly();
+			else
+				handles[handle] = xs_daemon_open();
+			if (!handles[handle])
+				barf_perror("Opening connection to daemon");
+		}
+		command = arg(line, 0);
+
+		if (timeout)
+			alarm(5);
+		if (streq(command, "dir"))
+			do_dir(handle, arg(line, 1));
+		else if (streq(command, "read"))
+			do_read(handle, arg(line, 1));
+		else if (streq(command, "write"))
+			do_write(handle,
+				 arg(line, 1), arg(line, 2), arg(line, 3));
+		else if (streq(command, "setid"))
+			do_setid(handle, arg(line, 1));
+		else if (streq(command, "mkdir"))
+			do_mkdir(handle, arg(line, 1));
+		else if (streq(command, "rm"))
+			do_rm(handle, arg(line, 1));
+		else if (streq(command, "getperm"))
+			do_getperm(handle, arg(line, 1));
+		else if (streq(command, "setperm"))
+			do_setperm(handle, arg(line, 1), line);
+		else if (streq(command, "shutdown"))
+			do_shutdown(handle);
+		else if (streq(command, "watch"))
+			do_watch(handle, arg(line, 1), arg(line, 2));
+		else if (streq(command, "waitwatch"))
+			do_waitwatch(handle);
+		else if (streq(command, "ackwatch"))
+			do_ackwatch(handle);
+		else if (streq(command, "unwatch"))
+			do_unwatch(handle, arg(line, 1));
+		else if (streq(command, "close")) {
+			xs_daemon_close(handles[handle]);
+			handles[handle] = NULL;
+		} else if (streq(command, "start"))
+			do_start(handle, arg(line, 1));
+		else if (streq(command, "commit"))
+			do_end(handle, false);
+		else if (streq(command, "abort"))
+			do_end(handle, true);
+		else if (streq(command, "introduce"))
+			do_introduce(handle, arg(line, 1), arg(line, 2),
+				     arg(line, 3), arg(line, 4));
+		else if (streq(command, "release"))
+			do_release(handle, arg(line, 1));
+		else if (streq(command, "dump"))
+			dump(handle);
+		else if (streq(command, "sleep"))
+			sleep(atoi(arg(line, 1)));
+		else
+			barf("Unknown command %s", command);
+		fflush(stdout);
+		/* Command finished in time: cancel the pending alarm. */
+		alarm(0);
+	}
+	return 0;
+}
author	cl349@firebug.cl.cam.ac.uk <cl349@firebug.cl.cam.ac.uk>	2005-06-07 12:43:58 +0000
committer	cl349@firebug.cl.cam.ac.uk <cl349@firebug.cl.cam.ac.uk>	2005-06-07 12:43:58 +0000
commit	29c9e570b1eddfd6df789e08da65cf4ddec5f6fe (patch)
tree	bf79ad3040d05ee9e05a60df3b8a364fcfa236dc
parent	636a81e9701d001f4c9108f722014f48f59eabbd (diff)
download	xen-29c9e570b1eddfd6df789e08da65cf4ddec5f6fe.tar.gz xen-29c9e570b1eddfd6df789e08da65cf4ddec5f6fe.tar.bz2 xen-29c9e570b1eddfd6df789e08da65cf4ddec5f6fe.zip